[Mesa-dev] [PATCH] llvmpipe: handle offset_clamp

2013-06-26 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This was just ignored (unless, for some reason such as unfilled polys, draw was
handling this).
I'm not convinced by this code; putting the float for the clamp in the key
isn't really a good idea. Then again, the other floats for depth bias are
already in there anyway (there should probably be a jit_context for the
setup function), so this is just a quick fix.
Also, the minimum resolvable depth difference used isn't really right as it
should be calculated according to the z values of the current primitive
and not be a constant (of course, this only makes a difference for float
depth buffers), at least for d3d10, so depth biasing is still not quite right.
---
 src/gallium/drivers/llvmpipe/lp_state_setup.c   |   20 +++-
 src/gallium/drivers/llvmpipe/lp_state_setup.h   |3 ++-
 src/gallium/drivers/llvmpipe/lp_state_surface.c |2 ++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c 
b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index ed68b98..2988bed 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -244,6 +244,7 @@ lp_do_offset_tri(struct gallivm_state *gallivm,
 {
LLVMBuilderRef b = gallivm-builder;
struct lp_build_context bld;
+   struct lp_build_context flt_scalar_bld;
LLVMValueRef zoffset, mult;
LLVMValueRef z0_new, z1_new, z2_new;
LLVMValueRef dzdxdzdy, dzdx, dzdy, dzxyz20, dyzzx01, dyzzx01_dzxyz20, 
dzx01_dyz20;
@@ -298,6 +299,18 @@ lp_do_offset_tri(struct gallivm_state *gallivm,
lp_build_const_float(gallivm, 
key-pgon_offset_units),
mult, zoffset);
 
+   lp_build_context_init(flt_scalar_bld, gallivm, lp_type_float_vec(32, 32));
+   if (key-pgon_offset_clamp  0) {
+  zoffset = lp_build_min(flt_scalar_bld,
+ lp_build_const_float(gallivm, 
key-pgon_offset_clamp),
+ zoffset);
+   }
+   else if (key-pgon_offset_clamp  0) {
+  zoffset = lp_build_max(flt_scalar_bld,
+ lp_build_const_float(gallivm, 
key-pgon_offset_clamp),
+ zoffset);
+   }
+
/* yuck */
shuffles[0] = twoi;
shuffles[1] = lp_build_const_int32(gallivm, 6);
@@ -312,6 +325,10 @@ lp_do_offset_tri(struct gallivm_state *gallivm,
zoffset = vec4f_from_scalar(gallivm, zoffset, );
 
/* clamp and do offset */
+   /*
+* XXX I suspect the clamp (is that even right to always clamp to fixed 
0.0/1.0?)
+* should really be per fragment?
+*/
z0z1z2 = lp_build_clamp(bld, LLVMBuildFAdd(b, z0z1z2, zoffset, ), 
bld.zero, bld.one);
 
/* insert into args-a0.z, a1.z, a2.z:
@@ -810,7 +827,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
key-pixel_center_half = lp-rasterizer-half_pixel_center;
key-twoside = lp-rasterizer-light_twoside;
key-size = Offset(struct lp_setup_variant_key,
- inputs[key-num_inputs]);
+  inputs[key-num_inputs]);
 
key-color_slot  = lp-color_slot [0];
key-bcolor_slot = lp-bcolor_slot[0];
@@ -823,6 +840,7 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
 
key-pgon_offset_units = (float) (lp-rasterizer-offset_units * lp-mrd);
key-pgon_offset_scale = lp-rasterizer-offset_scale;
+   key-pgon_offset_clamp = lp-rasterizer-offset_clamp;
key-pad = 0;
memcpy(key-inputs, fs-inputs, key-num_inputs * sizeof key-inputs[0]);
for (i = 0; i  key-num_inputs; i++) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h 
b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 73d40a5..c2a2c7f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -14,7 +14,7 @@ struct lp_setup_variant_list_item
 };
 
 
-struct lp_setup_variant_key {   
+struct lp_setup_variant_key {
unsigned size:16;
unsigned num_inputs:8;
int color_slot:8;
@@ -29,6 +29,7 @@ struct lp_setup_variant_key {
 
float pgon_offset_units;
float pgon_offset_scale;
+   float pgon_offset_clamp;
struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
 };
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c 
b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index 375ceb2..e6aac31 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -65,6 +65,8 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
   }
 
   /* Tell draw module how deep the Z/depth buffer is */
+  /* FIXME: mrd constant isn't right should use a value derived from
+   * current primitive not a constant (for float depth buffers) */
   if (lp-framebuffer.zsbuf) {
  int depth_bits;
  double mrd;
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org

[Mesa-dev] [PATCH 1/2] llvmpipe: add support for nested / overlapping queries

2013-06-25 Thread sroland
From: Roland Scheidegger srol...@vmware.com

OpenGL doesn't support this but d3d10 does.
It is a bit of a pain as it is necessary to keep track of queries
still active at the end of a scene, which is also why I cheat a bit
and limit the number of simultaneously active queries to an (arbitrary)
16 (this simplifies things because we don't have to deal with a real list
that way). I can't think of a reason why you'd really want large
numbers of overlapping/nested queries, so it is hopefully fine.
(This only affects queries which need to be binned.)
---
 src/gallium/drivers/llvmpipe/lp_context.h   |4 +-
 src/gallium/drivers/llvmpipe/lp_query.c |6 +--
 src/gallium/drivers/llvmpipe/lp_rast.h  |2 +
 src/gallium/drivers/llvmpipe/lp_scene.h |2 +-
 src/gallium/drivers/llvmpipe/lp_setup.c |   58 +--
 src/gallium/drivers/llvmpipe/lp_setup_context.h |3 +-
 src/gallium/drivers/llvmpipe/lp_setup_line.c|4 +-
 src/gallium/drivers/llvmpipe/lp_setup_point.c   |4 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c |4 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |2 +-
 10 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_context.h 
b/src/gallium/drivers/llvmpipe/lp_context.h
index ab52001..9495e42 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -97,9 +97,9 @@ struct llvmpipe_context {
struct pipe_query_data_pipeline_statistics pipeline_statistics;
unsigned active_statistics_queries;
 
-   unsigned dirty; /** Mask of LP_NEW_x flags */
+   unsigned active_occlusion_queries;
 
-   unsigned active_occlusion_query;
+   unsigned dirty; /** Mask of LP_NEW_x flags */
 
/** Mapped vertex buffers */
ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c 
b/src/gallium/drivers/llvmpipe/lp_query.c
index 38d6b84..ac6d09d 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -233,7 +233,7 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
   break;
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
-  llvmpipe-active_occlusion_query++;
+  llvmpipe-active_occlusion_queries++;
   llvmpipe-dirty |= LP_NEW_OCCLUSION_QUERY;
   break;
default:
@@ -288,8 +288,8 @@ llvmpipe_end_query(struct pipe_context *pipe, struct 
pipe_query *q)
   break;
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
-  assert(llvmpipe-active_occlusion_query);
-  llvmpipe-active_occlusion_query--;
+  assert(llvmpipe-active_occlusion_queries);
+  llvmpipe-active_occlusion_queries--;
   llvmpipe-dirty |= LP_NEW_OCCLUSION_QUERY;
   break;
default:
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h 
b/src/gallium/drivers/llvmpipe/lp_rast.h
index 50917a7..e6e44c5 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -53,6 +53,8 @@ struct cmd_bin;
 /* Rasterizer output size going to jit fs, width/height */
 #define LP_RASTER_BLOCK_SIZE 4
 
+#define LP_MAX_ACTIVE_BINNED_QUERIES 16
+
 
 struct lp_rasterizer_task;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h 
b/src/gallium/drivers/llvmpipe/lp_scene.h
index 16f6969..59cce7d 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -130,7 +130,7 @@ struct lp_scene {
struct lp_fence *fence;
 
/* The queries still active at end of scene */
-   struct llvmpipe_query *active_queries[3];
+   struct llvmpipe_query *active_queries[LP_MAX_ACTIVE_BINNED_QUERIES];
unsigned num_active_queries;
 
/* Framebuffer mappings - valid only between begin_rasterization()
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index d2c5325..bb3a5c0 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -155,22 +155,9 @@ lp_setup_rasterize_scene( struct lp_setup_context *setup )
struct lp_scene *scene = setup-scene;
struct llvmpipe_screen *screen = llvmpipe_screen(scene-pipe-screen);
 
-   scene-num_active_queries = 0;
-   if (setup-active_query[PIPE_QUERY_OCCLUSION_COUNTER]) {
-  scene-active_queries[scene-num_active_queries] =
- setup-active_query[PIPE_QUERY_OCCLUSION_COUNTER];
-  scene-num_active_queries++;
-   }
-   if (setup-active_query[PIPE_QUERY_OCCLUSION_PREDICATE]) {
-  scene-active_queries[scene-num_active_queries] =
- setup-active_query[PIPE_QUERY_OCCLUSION_PREDICATE];
-  scene-num_active_queries++;
-   }
-   if (setup-active_query[PIPE_QUERY_PIPELINE_STATISTICS]) {
-  scene-active_queries[scene-num_active_queries] =
- setup-active_query[PIPE_QUERY_PIPELINE_STATISTICS];
-  scene-num_active_queries++;
-   }
+   scene-num_active_queries = 

[Mesa-dev] [PATCH 2/2] softpipe: honor predication for clear_render_target and clear_depth_stencil

2013-06-25 Thread sroland
From: Roland Scheidegger srol...@vmware.com

trivial, copied from llvmpipe
---
 src/gallium/drivers/softpipe/sp_surface.c |   42 +++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_surface.c 
b/src/gallium/drivers/softpipe/sp_surface.c
index 52c85be..9e1523f 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -29,6 +29,7 @@
 #include util/u_surface.h
 #include sp_context.h
 #include sp_surface.h
+#include sp_query.h
 
 static void sp_blit(struct pipe_context *pipe,
 const struct pipe_blit_info *info)
@@ -82,11 +83,48 @@ static void sp_blit(struct pipe_context *pipe,
util_blitter_blit(sp-blitter, info);
 }
 
+static void
+softpipe_clear_render_target(struct pipe_context *pipe,
+ struct pipe_surface *dst,
+ const union pipe_color_union *color,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   if (!softpipe_check_render_cond(softpipe))
+  return;
+
+   util_clear_render_target(pipe, dst, color,
+dstx, dsty, width, height);
+}
+
+
+static void
+softpipe_clear_depth_stencil(struct pipe_context *pipe,
+ struct pipe_surface *dst,
+ unsigned clear_flags,
+ double depth,
+ unsigned stencil,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+
+   if (!softpipe_check_render_cond(softpipe))
+  return;
+
+   util_clear_depth_stencil(pipe, dst, clear_flags,
+depth, stencil,
+dstx, dsty, width, height);
+}
+
+
 void
 sp_init_surface_functions(struct softpipe_context *sp)
 {
sp-pipe.resource_copy_region = util_resource_copy_region;
-   sp-pipe.clear_render_target = util_clear_render_target;
-   sp-pipe.clear_depth_stencil = util_clear_depth_stencil;
+   sp-pipe.clear_render_target = softpipe_clear_render_target;
+   sp-pipe.clear_depth_stencil = softpipe_clear_depth_stencil;
sp-pipe.blit = sp_blit;
 }
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: rework query logic

2013-06-24 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Previously lp_rast_begin_query commands were always inserted into each bin,
and re-issued if the scene was restarted, while lp_rast_end_query commands
were executed for each still active query at the end of tile rasterization.
Also, the ps_invocations and vis_counter were set to zero when the respective
command was encountered.
This, however, cannot work for multiple queries of the same type (note that
occlusion counter and occlusion predicate, while different types, were also
affected).
So, change the logic to always set the ps_invocations and vis_counter to zero
at the start of tile rasterization, and then use start and end per-thread
query values when encountering the begin/end query commands instead, which
should work for multiple queries of the same type. This also means queries do
not have to be reissued in a new scene, however they still need to be finished
at end of tile rasterization, so a list of queries still active at the end of
a scene needs to be maintained.
Also while here don't bin the queries which don't do anything in rasterization.
(This change does not actually handle multiple queries of the same type yet,
as the list of active queries is just a simple fixed array and setup can still
only have one query active per type.)
---
 src/gallium/drivers/llvmpipe/lp_query.c |   13 +++--
 src/gallium/drivers/llvmpipe/lp_query.h |3 +-
 src/gallium/drivers/llvmpipe/lp_rast.c  |   56 ++
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |9 ++-
 src/gallium/drivers/llvmpipe/lp_scene.h |4 ++
 src/gallium/drivers/llvmpipe/lp_setup.c |   81 +--
 src/gallium/drivers/llvmpipe/lp_setup_tri.c |5 ++
 7 files changed, 92 insertions(+), 79 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_query.c 
b/src/gallium/drivers/llvmpipe/lp_query.c
index 1d3edff..49abed0 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -120,19 +120,19 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
switch (pq-type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
   for (i = 0; i  num_threads; i++) {
- *result += pq-count[i];
+ *result += pq-end[i];
   }
   break;
case PIPE_QUERY_OCCLUSION_PREDICATE:
   for (i = 0; i  num_threads; i++) {
  /* safer (still not guaranteed) when there's an overflow */
- vresult-b = vresult-b || pq-count[i];
+ vresult-b = vresult-b || pq-end[i];
   }
   break;
case PIPE_QUERY_TIMESTAMP:
   for (i = 0; i  num_threads; i++) {
- if (pq-count[i]  *result) {
-*result = pq-count[i];
+ if (pq-end[i]  *result) {
+*result = pq-end[i];
  }
  if (*result == 0)
 *result = os_time_get_nano();
@@ -170,7 +170,7 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
  (struct pipe_query_data_pipeline_statistics *)vresult;
   /* only ps_invocations come from binned query */
   for (i = 0; i  num_threads; i++) {
- pq-stats.ps_invocations += pq-count[i];
+ pq-stats.ps_invocations += pq-end[i];
   }
   pq-stats.ps_invocations *= LP_RASTER_BLOCK_SIZE * LP_RASTER_BLOCK_SIZE;
   *stats = pq-stats;
@@ -200,7 +200,8 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
}
 
 
-   memset(pq-count, 0, sizeof(pq-count));
+   memset(pq-start, 0, sizeof(pq-start));
+   memset(pq-end, 0, sizeof(pq-end));
lp_setup_begin_query(llvmpipe-setup, pq);
 
switch (pq-type) {
diff --git a/src/gallium/drivers/llvmpipe/lp_query.h 
b/src/gallium/drivers/llvmpipe/lp_query.h
index e29022a..62ad5fd 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.h
+++ b/src/gallium/drivers/llvmpipe/lp_query.h
@@ -42,7 +42,8 @@ struct llvmpipe_context;
 
 
 struct llvmpipe_query {
-   uint64_t count[LP_MAX_THREADS];  /* a counter for each thread */
+   uint64_t start[LP_MAX_THREADS];  /* start count value for each thread */
+   uint64_t end[LP_MAX_THREADS];/* end count value for each thread */
struct lp_fence *fence;  /* fence from last scene this was binned 
in */
unsigned type;   /* PIPE_QUERY_* */
unsigned num_primitives_generated;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c 
b/src/gallium/drivers/llvmpipe/lp_rast.c
index 62a82e3..871cc50 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -61,7 +61,6 @@ static void
 lp_rast_begin( struct lp_rasterizer *rast,
struct lp_scene *scene )
 {
-
rast-curr_scene = scene;
 
LP_DBG(DEBUG_RAST, %s\n, __FUNCTION__);
@@ -100,6 +99,9 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
task-height = TILE_SIZE + y * TILE_SIZE  task-scene-fb.height ?
 task-scene-fb.height - y * TILE_SIZE : TILE_SIZE;
 
+   task-thread_data.vis_counter = 0;
+   task-ps_invocations = 0;
+
/* reset pointers to 

[Mesa-dev] [PATCH] llvmpipe: fix wrong results for queries not in a scene

2013-06-20 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The result isn't always 0 in this case (depends on query type),
so instead of special casing this just use the ordinary path (should result
in correct values thanks to initialization in query_begin/end), just
skipping the fence wait.
---
 src/gallium/drivers/llvmpipe/lp_query.c |   21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_query.c 
b/src/gallium/drivers/llvmpipe/lp_query.c
index 386639e..1d3edff 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -100,20 +100,17 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
uint64_t *result = (uint64_t *)vresult;
int i;
 
-   if (!pq-fence) {
-  /* no fence because there was no scene, so results is zero */
-  *result = 0;
-  return TRUE;
-   }
-
-   if (!lp_fence_signalled(pq-fence)) {
-  if (!lp_fence_issued(pq-fence))
- llvmpipe_flush(pipe, NULL, __FUNCTION__);
+   if (pq-fence) {
+  /* only have a fence if there was a scene */
+  if (!lp_fence_signalled(pq-fence)) {
+ if (!lp_fence_issued(pq-fence))
+llvmpipe_flush(pipe, NULL, __FUNCTION__);
 
-  if (!wait)
- return FALSE;
+ if (!wait)
+return FALSE;
 
-  lp_fence_wait(pq-fence);
+ lp_fence_wait(pq-fence);
+  }
}
 
/* Sum the results from each of the threads:
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] gallium: fix PIPE_QUERY_TIMESTAMP_DISJOINT

2013-06-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The semantics didn't really make sense, matching neither d3d9
(though the docs are all broken there) nor d3d10. So make it match d3d10
semantics, which actually gives meaning to the disjoint part.
Drivers are fixed up in a very primitive way: I have no idea what could
actually cause the counter to become unreliable, so just always return
FALSE for the disjoint part.
---
 src/gallium/docs/source/context.rst   |   10 ++
 src/gallium/drivers/nv50/nv50_query.c |5 +++--
 src/gallium/drivers/nvc0/nvc0_query.c |4 +---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/gallium/docs/source/context.rst 
b/src/gallium/docs/source/context.rst
index ede89be..bfd58a4 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -330,11 +330,13 @@ scaled to nanoseconds, recorded after all commands issued 
prior to
 This query does not require a call to ``begin_query``.
 The result is an unsigned 64-bit integer.
 
-``PIPE_QUERY_TIMESTAMP_DISJOINT`` can be used to check whether the
-internal timer resolution is good enough to distinguish between the
-events at ``begin_query`` and ``end_query``.
+``PIPE_QUERY_TIMESTAMP_DISJOINT`` can be used to check the
+internal timer resolution and whether the timestamp counter has become
+unreliable due to things like throttling etc. - only if this is FALSE
+a timestamp query (within the timestamp_disjoint query) should be trusted.
 The result is a 64-bit integer specifying the timer resolution in Hz,
-followed by a boolean value indicating whether the timer has incremented.
+followed by a boolean value indicating whether the timestamp counter
+is discontinuous or disjoint.
 
 ``PIPE_QUERY_PRIMITIVES_GENERATED`` returns a 64-bit integer indicating
 the number of primitives processed by the pipeline (regardless of whether
diff --git a/src/gallium/drivers/nv50/nv50_query.c 
b/src/gallium/drivers/nv50/nv50_query.c
index 656ff9d..b97eff2 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -181,7 +181,6 @@ nv50_query_begin(struct pipe_context *pipe, struct 
pipe_query *pq)
   nv50_query_get(push, q, 0x20, 0x05805002);
   nv50_query_get(push, q, 0x30, 0x06805002);
   break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_TIME_ELAPSED:
   nv50_query_get(push, q, 0x10, 0x5002);
   break;
@@ -229,6 +228,8 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query 
*pq)
case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
   nv50_query_get(push, q, 0, 0x0d005002 | (q-index  5));
   break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+  break;
default:
   assert(0);
   break;
@@ -289,7 +290,7 @@ nv50_query_result(struct pipe_context *pipe, struct 
pipe_query *pq,
   break;
case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
   res64[0] = 10;
-  res8[8] = (data64[1] == data64[3]) ? FALSE : TRUE;
+  res8[8] = FALSE;
   break;
case PIPE_QUERY_TIME_ELAPSED:
   res64[0] = data64[1] - data64[3];
diff --git a/src/gallium/drivers/nvc0/nvc0_query.c 
b/src/gallium/drivers/nvc0/nvc0_query.c
index 8e584c9..3f5a9fb 100644
--- a/src/gallium/drivers/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nvc0/nvc0_query.c
@@ -285,7 +285,6 @@ nvc0_query_begin(struct pipe_context *pipe, struct 
pipe_query *pq)
case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   nvc0_query_get(push, q, 0x10, 0x03005002 | (q-index  5));
   break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_TIME_ELAPSED:
   nvc0_query_get(push, q, 0x10, 0x5002);
   break;
@@ -360,7 +359,6 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query 
*pq)
   nvc0_query_get(push, q, 0x20, 0x5002);
   break;
case PIPE_QUERY_TIMESTAMP:
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_TIME_ELAPSED:
   nvc0_query_get(push, q, 0, 0x5002);
   break;
@@ -478,7 +476,7 @@ nvc0_query_result(struct pipe_context *pipe, struct 
pipe_query *pq,
   break;
case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
   res64[0] = 10;
-  res8[8] = (data64[1] == data64[3]) ? FALSE : TRUE;
+  res8[8] = FALSE;
   break;
case PIPE_QUERY_TIME_ELAPSED:
   res64[0] = data64[1] - data64[3];
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] softpipe: handle all queries, and change for the new disjoint semantics

2013-06-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The driver can do render_condition but wasn't handling the occlusion
and so_overflow predicates (though the latter might not work yet due
to gs support).
---
 src/gallium/drivers/softpipe/sp_query.c |   39 ++-
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_query.c 
b/src/gallium/drivers/softpipe/sp_query.c
index b5bc0db..daeef53 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -60,8 +60,10 @@ softpipe_create_query(struct pipe_context *pipe,
struct softpipe_query* sq;
 
assert(type == PIPE_QUERY_OCCLUSION_COUNTER ||
+  type == PIPE_QUERY_OCCLUSION_PREDICATE ||
   type == PIPE_QUERY_TIME_ELAPSED ||
   type == PIPE_QUERY_SO_STATISTICS ||
+  type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
   type == PIPE_QUERY_PRIMITIVES_EMITTED ||
   type == PIPE_QUERY_PRIMITIVES_GENERATED || 
   type == PIPE_QUERY_PIPELINE_STATISTICS ||
@@ -90,9 +92,9 @@ softpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
 
switch (sq-type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
   sq-start = softpipe-occlusion_count;
   break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_TIME_ELAPSED:
   sq-start = os_time_get_nano();
   break;
@@ -102,6 +104,10 @@ softpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
   softpipe-num_primitives_generated = 0;
   sq-so.num_primitives_written = 0;
   softpipe-so_stats.num_primitives_written = 0;
+  break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+  sq-end = FALSE;
+  break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
   sq-so.num_primitives_written = 0;
   softpipe-so_stats.num_primitives_written = 0;
@@ -112,6 +118,7 @@ softpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
   break;
case PIPE_QUERY_TIMESTAMP:
case PIPE_QUERY_GPU_FINISHED:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   break;
case PIPE_QUERY_PIPELINE_STATISTICS:
   /* reset our cache */
@@ -141,15 +148,19 @@ softpipe_end_query(struct pipe_context *pipe, struct 
pipe_query *q)
softpipe-active_query_count--;
switch (sq-type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
   sq-end = softpipe-occlusion_count;
   break;
case PIPE_QUERY_TIMESTAMP:
   sq-start = 0;
   /* fall through */
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_TIME_ELAPSED:
   sq-end = os_time_get_nano();
   break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+  sq-end = (softpipe-num_primitives_generated 
+ softpipe-so_stats.num_primitives_written);
+  break;
case PIPE_QUERY_SO_STATISTICS:
   sq-num_primitives_generated =
  softpipe-num_primitives_generated;
@@ -164,6 +175,7 @@ softpipe_end_query(struct pipe_context *pipe, struct 
pipe_query *q)
   sq-num_primitives_generated = softpipe-num_primitives_generated;
   break;
case PIPE_QUERY_GPU_FINISHED:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   break;
case PIPE_QUERY_PIPELINE_STATISTICS:
   sq-stats.ia_vertices =
@@ -195,9 +207,9 @@ softpipe_end_query(struct pipe_context *pipe, struct 
pipe_query *q)
 
 static boolean
 softpipe_get_query_result(struct pipe_context *pipe, 
- struct pipe_query *q,
- boolean wait,
- union pipe_query_result *vresult)
+  struct pipe_query *q,
+  boolean wait,
+  union pipe_query_result *vresult)
 {
struct softpipe_query *sq = softpipe_query(q);
uint64_t *result = (uint64_t*)vresult;
@@ -215,15 +227,17 @@ softpipe_get_query_result(struct pipe_context *pipe,
  sizeof(struct pipe_query_data_pipeline_statistics));;
   break;
case PIPE_QUERY_GPU_FINISHED:
-  *result = TRUE;
+  vresult-b = TRUE;
+  break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+  vresult-b = sq-end != 0;
   break;
case PIPE_QUERY_TIMESTAMP_DISJOINT: {
-  struct pipe_query_data_timestamp_disjoint td;
+  struct pipe_query_data_timestamp_disjoint *td =
+  (struct pipe_query_data_timestamp_disjoint *)vresult;
   /* os_get_time_nano return nanoseconds */
-  td.frequency = UINT64_C(10);
-  td.disjoint = sq-end != sq-start;
-  memcpy(vresult, td,
- sizeof(struct pipe_query_data_timestamp_disjoint));
+  td-frequency = UINT64_C(10);
+  td-disjoint = FALSE;
}
   break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
@@ -232,6 +246,9 @@ softpipe_get_query_result(struct pipe_context *pipe,
case PIPE_QUERY_PRIMITIVES_GENERATED:
   *result = sq-num_primitives_generated;
   break;
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+  

[Mesa-dev] [PATCH] llvmpipe: handle more queries

2013-06-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Handle PIPE_QUERY_GPU_FINISHED and PIPE_QUERY_TIMESTAMP_DISJOINT, and
also fill out the ps_invocations and c_primitives from the
PIPE_QUERY_PIPELINE_STATISTICS (the others in there should already
be handled). Note that ps_invocations isn't pixel exact, only 16-pixel
exact, but I guess it's better than nothing.
It doesn't really seem to work correctly, but there are probably bugs elsewhere.
Also use a 64bit counter for occlusion queries.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c   |   11 ---
 src/gallium/drivers/llvmpipe/lp_jit.c |2 +-
 src/gallium/drivers/llvmpipe/lp_jit.h |2 +-
 src/gallium/drivers/llvmpipe/lp_query.c   |   23 ---
 src/gallium/drivers/llvmpipe/lp_rast.c|   19 ---
 src/gallium/drivers/llvmpipe/lp_rast_priv.h   |6 +-
 src/gallium/drivers/llvmpipe/lp_setup.c   |4 ++--
 src/gallium/drivers/llvmpipe/lp_setup_line.c  |   13 -
 src/gallium/drivers/llvmpipe/lp_setup_point.c |   10 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c   |8 
 10 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index edb59cc..79891cf 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -429,7 +429,7 @@ get_s_shift_and_mask(const struct util_format_description 
*format_desc,
  * Test the depth mask. Add the number of channel which has none zero mask
  * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
  * The counter will add 4.
- * TODO: could get that out of the loop, and need to use 64bit counter.
+ * TODO: could get that out of the fs loop.
  *
  * \param type holds element type of the mask vector.
  * \param maskvalue is the depth test mask.
@@ -458,6 +458,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
   LLVMInt32TypeInContext(context), bits);
   count = lp_build_intrinsic_unary(builder, popcntintr,
LLVMInt32TypeInContext(context), bits);
+  count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), 
);
}
else if(util_cpu_caps.has_avx  type.length == 8) {
   const char *movmskintr = llvm.x86.avx.movmsk.ps.256;
@@ -468,6 +469,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
   LLVMInt32TypeInContext(context), bits);
   count = lp_build_intrinsic_unary(builder, popcntintr,
LLVMInt32TypeInContext(context), bits);
+  count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), 
);
}
else {
   unsigned i;
@@ -510,8 +512,11 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
}
count = lp_build_intrinsic_unary(builder, popcntintr, counttype, 
countd);
 
-   if (type.length  4) {
-  count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 
32), );
+   if (type.length  8) {
+  count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 
64), );
+   }
+   else if (type.length  8) {
+  count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 
64), );
}
}
newcount = LLVMBuildLoad(builder, counter, origcount);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c 
b/src/gallium/drivers/llvmpipe/lp_jit.c
index f517b67..fa0f128 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -195,7 +195,7 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
   LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT];
   LLVMTypeRef thread_data_type;
 
-  elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt32TypeInContext(lc);
+  elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
 
   thread_data_type = LLVMStructTypeInContext(lc, elem_types,
  Elements(elem_types), 0);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h 
b/src/gallium/drivers/llvmpipe/lp_jit.h
index 2ecfde7..30cfaae 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -164,7 +164,7 @@ enum {
 
 struct lp_jit_thread_data
 {
-   uint32_t vis_counter;
+   uint64_t vis_counter;
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c 
b/src/gallium/drivers/llvmpipe/lp_query.c
index 922913d..7fbf5f7 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -40,6 +40,7 @@
 #include lp_query.h
 #include lp_screen.h
 #include lp_state.h
+#include lp_rast.h
 
 
 static struct llvmpipe_query *llvmpipe_query( struct pipe_query *p )
@@ -128,7 +129,7 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
case PIPE_QUERY_OCCLUSION_PREDICATE:
   for (i = 0; i  num_threads; i++) 

[Mesa-dev] [PATCH] gallium: add condition parameter to render_condition

2013-06-14 Thread sroland
From: Roland Scheidegger srol...@vmware.com

For conditional rendering this makes it possible to skip rendering
when the predicate is either true or false, as supported by d3d10
(in fact, previously it was sort of implied: skip rendering if the predicate
is false for an occlusion predicate, and true for an so_overflow predicate).
There's no cap bit for this as presumably all drivers could do it trivially
(but this patch does not implement it for the drivers using true
hw predicates: nvxx, r600, radeonsi; no change is expected for OpenGL
functionality).
---
 src/gallium/auxiliary/cso_cache/cso_context.c |   13 ++---
 src/gallium/auxiliary/cso_cache/cso_context.h |3 ++-
 src/gallium/auxiliary/hud/hud_context.c   |2 +-
 src/gallium/auxiliary/postprocess/pp_run.c|2 +-
 src/gallium/auxiliary/util/u_blit.c   |2 +-
 src/gallium/auxiliary/util/u_blitter.c|3 ++-
 src/gallium/auxiliary/util/u_blitter.h|3 +++
 src/gallium/auxiliary/util/u_gen_mipmap.c |2 +-
 src/gallium/docs/source/context.rst   |   14 +-
 src/gallium/drivers/galahad/glhd_context.c|3 ++-
 src/gallium/drivers/ilo/ilo_3d.c  |4 +++-
 src/gallium/drivers/ilo/ilo_3d.h  |1 +
 src/gallium/drivers/llvmpipe/lp_context.c |2 ++
 src/gallium/drivers/llvmpipe/lp_context.h |1 +
 src/gallium/drivers/llvmpipe/lp_query.c   |3 ++-
 src/gallium/drivers/llvmpipe/lp_surface.c |2 +-
 src/gallium/drivers/nv30/nv30_context.h   |1 +
 src/gallium/drivers/nv30/nv30_miptree.c   |2 +-
 src/gallium/drivers/nv30/nv30_query.c |4 +++-
 src/gallium/drivers/nv50/nv50_context.h   |1 +
 src/gallium/drivers/nv50/nv50_query.c |4 +++-
 src/gallium/drivers/nv50/nv50_surface.c   |2 +-
 src/gallium/drivers/nvc0/nvc0_context.h   |1 +
 src/gallium/drivers/nvc0/nvc0_query.c |4 +++-
 src/gallium/drivers/nvc0/nvc0_surface.c   |2 +-
 src/gallium/drivers/r300/r300_query.c |7 ---
 src/gallium/drivers/r600/r600_blit.c  |1 +
 src/gallium/drivers/r600/r600_pipe.c  |6 --
 src/gallium/drivers/r600/r600_pipe.h  |1 +
 src/gallium/drivers/r600/r600_query.c |2 ++
 src/gallium/drivers/radeonsi/r600_blit.c  |4 +++-
 src/gallium/drivers/radeonsi/r600_query.c |4 +++-
 src/gallium/drivers/radeonsi/radeonsi_pipe.c  |6 --
 src/gallium/drivers/radeonsi/radeonsi_pipe.h  |2 ++
 src/gallium/drivers/softpipe/sp_context.c |2 ++
 src/gallium/drivers/softpipe/sp_context.h |1 +
 src/gallium/drivers/softpipe/sp_query.c   |2 +-
 src/gallium/drivers/softpipe/sp_surface.c |2 +-
 src/gallium/drivers/svga/svga_pipe_blit.c |2 +-
 src/gallium/drivers/trace/tr_context.c|4 +++-
 src/gallium/include/pipe/p_context.h  |2 ++
 src/mesa/state_tracker/st_cb_condrender.c |6 +++---
 42 files changed, 95 insertions(+), 40 deletions(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c 
b/src/gallium/auxiliary/cso_cache/cso_context.c
index b06a070..6805427 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -111,6 +111,7 @@ struct cso_context {
void *velements, *velements_saved;
struct pipe_query *render_condition, *render_condition_saved;
uint render_condition_mode, render_condition_mode_saved;
+   boolean render_condition_cond, render_condition_cond_saved;
 
struct pipe_clip_state clip;
struct pipe_clip_state clip_saved;
@@ -723,13 +724,17 @@ void cso_restore_stencil_ref(struct cso_context *ctx)
 }
 
 void cso_set_render_condition(struct cso_context *ctx,
-  struct pipe_query *query, uint mode)
+  struct pipe_query *query,
+  boolean condition, uint mode)
 {
struct pipe_context *pipe = ctx-pipe;
 
-   if (ctx-render_condition != query || ctx-render_condition_mode != mode) {
-  pipe-render_condition(pipe, query, mode);
+   if (ctx-render_condition != query ||
+   ctx-render_condition_mode != mode ||
+   ctx-render_condition_cond != condition) {
+  pipe-render_condition(pipe, query, condition, mode);
   ctx-render_condition = query;
+  ctx-render_condition_cond = condition;
   ctx-render_condition_mode = mode;
}
 }
@@ -737,12 +742,14 @@ void cso_set_render_condition(struct cso_context *ctx,
 void cso_save_render_condition(struct cso_context *ctx)
 {
ctx-render_condition_saved = ctx-render_condition;
+   ctx-render_condition_cond_saved = ctx-render_condition_cond;
ctx-render_condition_mode_saved = ctx-render_condition_mode;
 }
 
 void cso_restore_render_condition(struct cso_context *ctx)
 {
cso_set_render_condition(ctx, ctx-render_condition_saved,
+ctx-render_condition_cond_saved,

[Mesa-dev] [PATCH] llvmpipe: fixes for conditional rendering

2013-06-14 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Honor render_condition for clear_render_target and clear_depth_stencil.
Also add minimal support for occlusion predicate, though it can't be active
at the same time as an occlusion query yet.
While here also switchify some large if-else (actually just mutually
exclusive if-if-if...) constructs.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |4 ++
 src/gallium/drivers/llvmpipe/lp_query.c |   78 +++
 src/gallium/drivers/llvmpipe/lp_rast.c  |2 +
 src/gallium/drivers/llvmpipe/lp_surface.c   |   42 ++-
 4 files changed, 90 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index a8bd15f..40ab7be 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -429,6 +429,10 @@ get_s_shift_and_mask(const struct util_format_description 
*format_desc,
  * Test the depth mask. Add the number of channel which has none zero mask
  * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}.
  * The counter will add 4.
+ * TODO: would be much easier if we'd just have a nx32bit counter
+ * and simply sub the masks here. Then add the individual values
+ * at query end. This however wouldn't work with 64bit counter values
+ * which we should also do.
  *
  * \param type holds element type of the mask vector.
  * \param maskvalue is the depth test mask.
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c 
b/src/gallium/drivers/llvmpipe/lp_query.c
index 973c689..84910b8 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -125,6 +125,12 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
  *result += pq-count[i];
   }
   break;
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+  for (i = 0; i  num_threads; i++) {
+ /* safer (still not guaranteed) when there's an overflow */
+ *result = *result || pq-count[i];
+  }
+  break;
case PIPE_QUERY_TIMESTAMP:
   for (i = 0; i  num_threads; i++) {
  if (pq-count[i]  *result) {
@@ -181,30 +187,28 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
 
 
memset(pq-count, 0, sizeof(pq-count));
+   /* XXX do we really need to bin all queries */
lp_setup_begin_query(llvmpipe-setup, pq);
 
-   if (pq-type == PIPE_QUERY_PRIMITIVES_EMITTED) {
+   switch (pq-type) {
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
   pq-num_primitives_written = 0;
   llvmpipe-so_stats.num_primitives_written = 0;
-   }
-
-   if (pq-type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+  break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
   pq-num_primitives_generated = 0;
   llvmpipe-num_primitives_generated = 0;
-   }
-
-   if (pq-type == PIPE_QUERY_SO_STATISTICS) {
+  break;
+   case PIPE_QUERY_SO_STATISTICS:
   pq-num_primitives_written = 0;
   llvmpipe-so_stats.num_primitives_written = 0;
   pq-num_primitives_generated = 0;
   llvmpipe-num_primitives_generated = 0;
-   }
-
-   if (pq-type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) {
+  break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
   pq-so_has_overflown = FALSE;
-   }
-
-   if (pq-type == PIPE_QUERY_PIPELINE_STATISTICS) {
+  break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
   /* reset our cache */
   if (llvmpipe-active_statistics_queries == 0) {
  memset(llvmpipe-pipeline_statistics, 0,
@@ -212,11 +216,16 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct 
pipe_query *q)
   }
   memcpy(pq-stats, llvmpipe-pipeline_statistics, sizeof(pq-stats));
   llvmpipe-active_statistics_queries++;
-   }
-
-   if (pq-type == PIPE_QUERY_OCCLUSION_COUNTER) {
-  llvmpipe-active_occlusion_query = TRUE;
+  break;
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+  /* Both active at same time will still fail all over the place.
+   * Then again several of each type can be active too... */
+  llvmpipe-active_occlusion_query++;
   llvmpipe-dirty |= LP_NEW_OCCLUSION_QUERY;
+  break;
+   default:
+  break;
}
 }
 
@@ -229,25 +238,23 @@ llvmpipe_end_query(struct pipe_context *pipe, struct 
pipe_query *q)
 
lp_setup_end_query(llvmpipe-setup, pq);
 
-   if (pq-type == PIPE_QUERY_PRIMITIVES_EMITTED) {
-  pq-num_primitives_written = llvmpipe-so_stats.num_primitives_written;
-   }
+   switch (pq-type) {
 
-   if (pq-type == PIPE_QUERY_PRIMITIVES_GENERATED) {
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+  pq-num_primitives_written = llvmpipe-so_stats.num_primitives_written;
+  break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
   pq-num_primitives_generated = llvmpipe-num_primitives_generated;
-   }
-
-   if (pq-type == PIPE_QUERY_SO_STATISTICS) {
+  break;
+   case PIPE_QUERY_SO_STATISTICS:
   pq-num_primitives_written = llvmpipe-so_stats.num_primitives_written;
   

[Mesa-dev] [PATCH] util: new util_fill_box helper

2013-06-08 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Use new util_fill_box helper for util_clear_render_target.
(Also fix off-by-one map error.)
---
 src/gallium/auxiliary/util/u_surface.c |   39 +--
 src/gallium/auxiliary/util/u_surface.h |7 +
 src/gallium/drivers/llvmpipe/lp_rast.c |   54 +++-
 3 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_surface.c 
b/src/gallium/auxiliary/util/u_surface.c
index 77d04ba..8411715 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -214,6 +214,29 @@ util_fill_rect(ubyte * dst,
 }
 
 
+void
+util_fill_box(ubyte * dst,
+  enum pipe_format format,
+  unsigned stride,
+  unsigned layer_stride,
+  unsigned x,
+  unsigned y,
+  unsigned z,
+  unsigned width,
+  unsigned height,
+  unsigned depth,
+  union util_color *uc)
+{
+   unsigned layer;
+   for (layer = 0; layer  depth; layer++) {
+  util_fill_rect(dst, format,
+ stride,
+ x, y, width, height, uc);
+  dst += layer_stride;
+   }
+}
+
+
 /**
  * Fallback function for pipe-resource_copy_region().
  * Note: (X,Y)=(0,0) is always the upper-left corner.
@@ -319,7 +342,7 @@ util_clear_render_target(struct pipe_context *pipe,
struct pipe_transfer *dst_trans;
ubyte *dst_map;
union util_color uc;
-   unsigned max_layer, layer;
+   unsigned max_layer;
 
assert(dst-texture);
if (!dst-texture)
@@ -349,7 +372,7 @@ util_clear_render_target(struct pipe_context *pipe,
  dst-u.tex.level,
  PIPE_TRANSFER_WRITE,
  dstx, dsty, dst-u.tex.first_layer,
- width, height, max_layer, dst_trans);
+ width, height, max_layer + 1, dst_trans);
}
 
assert(dst_map);
@@ -376,12 +399,9 @@ util_clear_render_target(struct pipe_context *pipe,
  util_pack_color(color-f, dst-format, uc);
   }
 
-  for (layer = 0; layer = max_layer; layer++) {
- util_fill_rect(dst_map, dst-format,
-dst_trans-stride,
-0, 0, width, height, uc);
- dst_map += dst_trans-layer_stride;
-  }
+  util_fill_box(dst_map, dst-format,
+dst_trans-stride, dst_trans-layer_stride,
+0, 0, 0, width, height, max_layer + 1, uc);
 
   pipe-transfer_unmap(pipe, dst_trans);
}
@@ -430,8 +450,7 @@ util_clear_depth_stencil(struct pipe_context *pipe,
 
if (dst_map) {
   unsigned dst_stride = dst_trans-stride;
-  uint64_t zstencil = util_pack64_z_stencil(format,
-depth, stencil);
+  uint64_t zstencil = util_pack64_z_stencil(format, depth, stencil);
   ubyte *dst_layer = dst_map;
   unsigned i, j;
   assert(dst_trans-stride  0);
diff --git a/src/gallium/auxiliary/util/u_surface.h 
b/src/gallium/auxiliary/util/u_surface.h
index d6184ac..bfd8f40 100644
--- a/src/gallium/auxiliary/util/u_surface.h
+++ b/src/gallium/auxiliary/util/u_surface.h
@@ -65,6 +65,13 @@ util_fill_rect(ubyte * dst, enum pipe_format format,
unsigned dst_stride, unsigned dst_x, unsigned dst_y,
unsigned width, unsigned height, union util_color *uc);
 
+extern void
+util_fill_box(ubyte * dst, enum pipe_format format,
+  unsigned stride, unsigned layer_stride,
+  unsigned x, unsigned y, unsigned z,
+  unsigned width, unsigned height, unsigned depth,
+  union util_color *uc);
+
 
 extern void
 util_resource_copy_region(struct pipe_context *pipe,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c 
b/src/gallium/drivers/llvmpipe/lp_rast.c
index dcd66ab..79d4c58 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -135,8 +135,6 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
 
  for (i = 0; i  scene-fb.nr_cbufs; i++) {
 enum pipe_format format = scene-fb.cbufs[i]-format;
-unsigned layer;
-uint8_t *map_layer = scene-cbufs[i].map;
 
 if (util_format_is_pure_sint(format)) {
util_format_write_4i(format, arg.clear_color.i, 0, uc, 0, 0, 
0, 1, 1);
@@ -146,17 +144,17 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
util_format_write_4ui(format, arg.clear_color.ui, 0, uc, 0, 0, 
0, 1, 1);
 }
 
-for (layer = 0; layer = scene-fb_max_layer; layer++) {
-   util_fill_rect(map_layer,
-  scene-fb.cbufs[i]-format,
-  scene-cbufs[i].stride,
-  task-x,
-  task-y,
- 

[Mesa-dev] [PATCH] gallium/docs: fix up transfer description for 1d arrays, add cube map arrays

2013-06-06 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Transfers always use z/depth for layers no matter if it's a 1d or 2d array
texture, we don't follow OpenGL's craziness there. Luckily this appears to
only be a doc bug, everyone is doing the right thing already.
While here also document z/depth parameter for cube map arrays.
---
 src/gallium/docs/source/context.rst |9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/gallium/docs/source/context.rst 
b/src/gallium/docs/source/context.rst
index 679772f..cfcd267 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -472,15 +472,14 @@ The box parameter to some of these functions defines a 
1D, 2D or 3D
 region of pixels.  This is self-explanatory for 1D, 2D and 3D texture
 targets.
 
-For PIPE_TEXTURE_1D_ARRAY, the box::y and box::height fields refer to the
-array dimension of the texture.
-
-For PIPE_TEXTURE_2D_ARRAY, the box::z and box::depth fields refer to the
-array dimension of the texture.
+For PIPE_TEXTURE_1D_ARRAY and PIPE_TEXTURE_2D_ARRAY, the box::z and box::depth
+fields refer to the array dimension of the texture.
 
 For PIPE_TEXTURE_CUBE, the box:z and box::depth fields refer to the
 faces of the cube map (z + depth = 6).
 
+For PIPE_TEXTURE_CUBE_ARRAY, the box::z and box::depth fields refer to
+both the face and array dimension of the texture (face = z % 6, array = z / 6).
 
 
 .. _transfer_flush_region:
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] llvmpipe: move create_surface/destroy_surface functions to lp_surface.c

2013-06-06 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Believe it or not but these two are actually the first two functions which
really belong in this file nowadays.
---
 src/gallium/drivers/llvmpipe/lp_surface.c |   59 -
 src/gallium/drivers/llvmpipe/lp_texture.c |   59 +
 2 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c 
b/src/gallium/drivers/llvmpipe/lp_surface.c
index 2019d52..a40fe1b 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -232,11 +232,68 @@ static void lp_blit(struct pipe_context *pipe,
 }
 
 
+static struct pipe_surface *
+llvmpipe_create_surface(struct pipe_context *pipe,
+struct pipe_resource *pt,
+const struct pipe_surface *surf_tmpl)
+{
+   struct pipe_surface *ps;
+
+   if (!(pt-bind  (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET)))
+  debug_printf(Illegal surface creation without bind flag\n);
+
+   ps = CALLOC_STRUCT(pipe_surface);
+   if (ps) {
+  pipe_reference_init(ps-reference, 1);
+  pipe_resource_reference(ps-texture, pt);
+  ps-context = pipe;
+  ps-format = surf_tmpl-format;
+  if (llvmpipe_resource_is_texture(pt)) {
+ assert(surf_tmpl-u.tex.level = pt-last_level);
+ assert(surf_tmpl-u.tex.first_layer = surf_tmpl-u.tex.last_layer);
+ ps-width = u_minify(pt-width0, surf_tmpl-u.tex.level);
+ ps-height = u_minify(pt-height0, surf_tmpl-u.tex.level);
+ ps-u.tex.level = surf_tmpl-u.tex.level;
+ ps-u.tex.first_layer = surf_tmpl-u.tex.first_layer;
+ ps-u.tex.last_layer = surf_tmpl-u.tex.last_layer;
+  }
+  else {
+ /* setting width as number of elements should get us correct 
renderbuffer width */
+ ps-width = surf_tmpl-u.buf.last_element - 
surf_tmpl-u.buf.first_element + 1;
+ ps-height = pt-height0;
+ ps-u.buf.first_element = surf_tmpl-u.buf.first_element;
+ ps-u.buf.last_element = surf_tmpl-u.buf.last_element;
+ assert(ps-u.buf.first_element = ps-u.buf.last_element);
+ assert(util_format_get_blocksize(surf_tmpl-format) *
+(ps-u.buf.last_element + 1) = pt-width0);
+  }
+   }
+   return ps;
+}
+
+
+static void
+llvmpipe_surface_destroy(struct pipe_context *pipe,
+ struct pipe_surface *surf)
+{
+   /* Effectively do the texture_update work here - if texture images
+* needed post-processing to put them into hardware layout, this is
+* where it would happen.  For llvmpipe, nothing to do.
+*/
+   assert(surf-texture);
+   pipe_resource_reference(surf-texture, NULL);
+   FREE(surf);
+}
+
+
 void
 llvmpipe_init_surface_functions(struct llvmpipe_context *lp)
 {
-   lp-pipe.resource_copy_region = lp_resource_copy;
lp-pipe.clear_render_target = util_clear_render_target;
lp-pipe.clear_depth_stencil = util_clear_depth_stencil;
+   lp-pipe.create_surface = llvmpipe_create_surface;
+   lp-pipe.surface_destroy = llvmpipe_surface_destroy;
+   /* These two are not actually functions dealing with surfaces */
+   lp-pipe.resource_copy_region = lp_resource_copy;
lp-pipe.blit = lp_blit;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 2263e0a..0088b6a 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -478,60 +478,6 @@ llvmpipe_resource_get_handle(struct pipe_screen *screen,
 }
 
 
-static struct pipe_surface *
-llvmpipe_create_surface(struct pipe_context *pipe,
-struct pipe_resource *pt,
-const struct pipe_surface *surf_tmpl)
-{
-   struct pipe_surface *ps;
-
-   if (!(pt-bind  (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET)))
-  debug_printf(Illegal surface creation without bind flag\n);
-
-   ps = CALLOC_STRUCT(pipe_surface);
-   if (ps) {
-  pipe_reference_init(ps-reference, 1);
-  pipe_resource_reference(ps-texture, pt);
-  ps-context = pipe;
-  ps-format = surf_tmpl-format;
-  if (llvmpipe_resource_is_texture(pt)) {
- assert(surf_tmpl-u.tex.level = pt-last_level);
- assert(surf_tmpl-u.tex.first_layer = surf_tmpl-u.tex.last_layer);
- ps-width = u_minify(pt-width0, surf_tmpl-u.tex.level);
- ps-height = u_minify(pt-height0, surf_tmpl-u.tex.level);
- ps-u.tex.level = surf_tmpl-u.tex.level;
- ps-u.tex.first_layer = surf_tmpl-u.tex.first_layer;
- ps-u.tex.last_layer = surf_tmpl-u.tex.last_layer;
-  }
-  else {
- /* setting width as number of elements should get us correct 
renderbuffer width */
- ps-width = surf_tmpl-u.buf.last_element - 
surf_tmpl-u.buf.first_element + 1;
- ps-height = pt-height0;
- ps-u.buf.first_element = surf_tmpl-u.buf.first_element;
- ps-u.buf.last_element = 

[Mesa-dev] [PATCH 2/2] util: fix util_clear_render_target and util_clear_depth_stencil layer handling

2013-06-06 Thread sroland
From: Roland Scheidegger srol...@vmware.com

These functions must clear all bound layers, not just the first.
---
 src/gallium/auxiliary/util/u_surface.c  |  190 +--
 src/gallium/auxiliary/util/u_transfer.c |1 +
 2 files changed, 104 insertions(+), 87 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_surface.c 
b/src/gallium/auxiliary/util/u_surface.c
index 5c3a655..77d04ba 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -307,6 +307,7 @@ no_src_map:
  * cpp  4 looks like a gross hack at best...
  * Plus can't use these transfer fallbacks when clearing
  * multisampled surfaces for instance.
+ * Clears all bound layers.
  */
 void
 util_clear_render_target(struct pipe_context *pipe,
@@ -316,8 +317,9 @@ util_clear_render_target(struct pipe_context *pipe,
  unsigned width, unsigned height)
 {
struct pipe_transfer *dst_trans;
-   void *dst_map;
+   ubyte *dst_map;
union util_color uc;
+   unsigned max_layer, layer;
 
assert(dst-texture);
if (!dst-texture)
@@ -332,6 +334,7 @@ util_clear_render_target(struct pipe_context *pipe,
   unsigned pixstride = util_format_get_blocksize(dst-format);
   dx = (dst-u.buf.first_element + dstx) * pixstride;
   w = width * pixstride;
+  max_layer = 0;
   dst_map = pipe_transfer_map(pipe,
   dst-texture,
   0, 0,
@@ -340,14 +343,13 @@ util_clear_render_target(struct pipe_context *pipe,
   dst_trans);
}
else {
-  /* XXX: should handle multiple layers */
-  dst_map = pipe_transfer_map(pipe,
-  dst-texture,
-  dst-u.tex.level,
-  dst-u.tex.first_layer,
-  PIPE_TRANSFER_WRITE,
-  dstx, dsty, width, height, dst_trans);
-
+  max_layer = dst-u.tex.last_layer - dst-u.tex.first_layer;
+  dst_map = pipe_transfer_map_3d(pipe,
+ dst-texture,
+ dst-u.tex.level,
+ PIPE_TRANSFER_WRITE,
+ dstx, dsty, dst-u.tex.first_layer,
+ width, height, max_layer, dst_trans);
}
 
assert(dst_map);
@@ -373,9 +375,13 @@ util_clear_render_target(struct pipe_context *pipe,
   else {
  util_pack_color(color-f, dst-format, uc);
   }
-  util_fill_rect(dst_map, dst-format,
- dst_trans-stride,
- 0, 0, width, height, uc);
+
+  for (layer = 0; layer = max_layer; layer++) {
+ util_fill_rect(dst_map, dst-format,
+dst_trans-stride,
+0, 0, width, height, uc);
+ dst_map += dst_trans-layer_stride;
+  }
 
   pipe-transfer_unmap(pipe, dst_trans);
}
@@ -386,6 +392,7 @@ util_clear_render_target(struct pipe_context *pipe,
  * sw fallback doesn't look terribly useful here.
  * Plus can't use these transfer fallbacks when clearing
  * multisampled surfaces for instance.
+ * Clears all bound layers.
  */
 void
 util_clear_depth_stencil(struct pipe_context *pipe,
@@ -400,6 +407,7 @@ util_clear_depth_stencil(struct pipe_context *pipe,
struct pipe_transfer *dst_trans;
ubyte *dst_map;
boolean need_rmw = FALSE;
+   unsigned max_layer, layer;
 
if ((clear_flags  PIPE_CLEAR_DEPTHSTENCIL) 
((clear_flags  PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) 
@@ -409,102 +417,110 @@ util_clear_depth_stencil(struct pipe_context *pipe,
assert(dst-texture);
if (!dst-texture)
   return;
-   dst_map = pipe_transfer_map(pipe,
-   dst-texture,
-   dst-u.tex.level,
-   dst-u.tex.first_layer,
-   (need_rmw ? PIPE_TRANSFER_READ_WRITE :
-   PIPE_TRANSFER_WRITE),
-   dstx, dsty, width, height, dst_trans);
+
+   max_layer = dst-u.tex.last_layer - dst-u.tex.first_layer;
+   dst_map = pipe_transfer_map_3d(pipe,
+  dst-texture,
+  dst-u.tex.level,
+  (need_rmw ? PIPE_TRANSFER_READ_WRITE :
+  PIPE_TRANSFER_WRITE),
+  dstx, dsty, dst-u.tex.first_layer,
+  width, height, max_layer + 1, dst_trans);
assert(dst_map);
 
if (dst_map) {
   unsigned dst_stride = dst_trans-stride;
   uint64_t zstencil = util_pack64_z_stencil(format,
 depth, stencil);
+  ubyte *dst_layer = dst_map;
   unsigned i, j;
   assert(dst_trans-stride  0);
 
-  

[Mesa-dev] [PATCH 1/3] gallium/tgsi: add missing string for layer semantic

2013-06-05 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Also report if a shader writes the layer semantic
---
 src/gallium/auxiliary/draw/draw_context.c |2 +-
 src/gallium/auxiliary/tgsi/tgsi_scan.c|5 +
 src/gallium/auxiliary/tgsi/tgsi_scan.h|1 +
 src/gallium/auxiliary/tgsi/tgsi_strings.c |1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index 58ce270..35063b9 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -548,7 +548,7 @@ draw_get_shader_info(const struct draw_context *draw)
  * function to find those attributes.
  *
  * -1 is returned if the attribute is not found since this is
- * an undefined situtation. Note, that zero is valid and can
+ * an undefined situation. Note, that zero is valid and can
  * be used by any of the attributes, because position is not
  * required to be attribute 0 or even at all present.
  */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c 
b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 0230267..d331257 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -217,6 +217,11 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
   TGSI_SEMANTIC_VIEWPORT_INDEX) {
  info-writes_viewport_index = TRUE;
   }
+  if (procType == TGSI_PROCESSOR_GEOMETRY 
+  fulldecl-Semantic.Name ==
+  TGSI_SEMANTIC_LAYER) {
+ info-writes_layer = TRUE;
+  }
}
 
  }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h 
b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 676abf0..a5b7024 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -76,6 +76,7 @@ struct tgsi_shader_info
boolean pixel_center_integer;
boolean color0_writes_all_cbufs;
boolean writes_viewport_index;
+   boolean writes_layer;
 
unsigned num_written_clipdistance;
/**
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c 
b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 6abf927..625107c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -80,6 +80,7 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
TEXCOORD,
PCOORD,
VIEWPORT_INDEX
+   LAYER
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] llvmpipe: add support for layered rendering

2013-06-05 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Mostly just make sure the layer parameter gets passed through to the right
places (and gets clamped, which can be done at setup time), fix up clears to
clear all layers and disable opaque optimization. Luckily we don't need to
touch the jitted code.
(Clears invoked via pipe's clear_render_target method will not work however
since the pipe_util_clear function used for it doesn't handle clearing
multiple layers yet.)
---
 src/gallium/drivers/llvmpipe/lp_context.h   |3 +
 src/gallium/drivers/llvmpipe/lp_jit.h   |2 +-
 src/gallium/drivers/llvmpipe/lp_rast.c  |  195 ---
 src/gallium/drivers/llvmpipe/lp_rast.h  |2 +-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |   20 ++-
 src/gallium/drivers/llvmpipe/lp_scene.c |   12 +-
 src/gallium/drivers/llvmpipe/lp_scene.h |7 +-
 src/gallium/drivers/llvmpipe/lp_setup.c |1 +
 src/gallium/drivers/llvmpipe/lp_setup_context.h |1 +
 src/gallium/drivers/llvmpipe/lp_setup_line.c|6 +
 src/gallium/drivers/llvmpipe/lp_setup_point.c   |7 +
 src/gallium/drivers/llvmpipe/lp_setup_tri.c |   17 +-
 src/gallium/drivers/llvmpipe/lp_state_derived.c |   13 +-
 src/gallium/drivers/llvmpipe/lp_texture.c   |3 -
 src/gallium/drivers/llvmpipe/lp_texture.h   |   10 ++
 15 files changed, 190 insertions(+), 109 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_context.h 
b/src/gallium/drivers/llvmpipe/lp_context.h
index 54f3830..abfe852 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -119,6 +119,9 @@ struct llvmpipe_context {
/** Which vertex shader output slot contains viewport index */
int viewport_index_slot;
 
+   /** Which geometry shader output slot contains layer */
+   int layer_slot;
+
/** minimum resolvable depth value, for polygon offset */   
double mrd;

diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h 
b/src/gallium/drivers/llvmpipe/lp_jit.h
index 4e9ca76..2ecfde7 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -204,7 +204,7 @@ typedef void
 const void *dadx,
 const void *dady,
 uint8_t **color,
-void *depth,
+uint8_t *depth,
 uint32_t mask,
 struct lp_jit_thread_data *thread_data,
 unsigned *stride,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c 
b/src/gallium/drivers/llvmpipe/lp_rast.c
index 981dd71..aa5224e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -134,6 +134,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
 
  for (i = 0; i  scene-fb.nr_cbufs; i++) {
 enum pipe_format format = scene-fb.cbufs[i]-format;
+unsigned layer;
+uint8_t *map_layer = scene-cbufs[i].map;
 
 if (util_format_is_pure_sint(format)) {
util_format_write_4i(format, arg.clear_color.i, 0, uc, 0, 0, 
0, 1, 1);
@@ -143,14 +145,17 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
util_format_write_4ui(format, arg.clear_color.ui, 0, uc, 0, 0, 
0, 1, 1);
 }
 
-util_fill_rect(scene-cbufs[i].map,
-   scene-fb.cbufs[i]-format,
-   scene-cbufs[i].stride,
-   task-x,
-   task-y,
-   task-width,
-   task-height,
-   uc);
+for (layer = 0; layer = scene-fb_max_layer; layer++) {
+   util_fill_rect(map_layer,
+  scene-fb.cbufs[i]-format,
+  scene-cbufs[i].stride,
+  task-x,
+  task-y,
+  task-width,
+  task-height,
+  uc);
+   map_layer += scene-cbufs[i].layer_stride;
+}
  }
   }
   else {
@@ -167,18 +172,21 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
 clear_color[3]);
 
  for (i = 0; i  scene-fb.nr_cbufs; i++) {
-
-util_pack_color(arg.clear_color.f,
-scene-fb.cbufs[i]-format, uc);
-
-util_fill_rect(scene-cbufs[i].map,
-   scene-fb.cbufs[i]-format,
-   scene-cbufs[i].stride,
-   task-x,
-   task-y,
-   task-width,
-   task-height,
-   uc);
+unsigned layer;
+uint8_t *map_layer = scene-cbufs[i].map;
+for (layer = 0; layer = scene-fb_max_layer; layer++) {
+   

[Mesa-dev] [PATCH 3/3] llvmpipe: bump 3d and cube map limits to 2048 and 8192 respectively

2013-06-05 Thread sroland
From: Roland Scheidegger srol...@vmware.com

These should just work (?), required by d3d10. Too large resources will
get thrown out separately anyway.
---
 src/gallium/drivers/llvmpipe/lp_limits.h |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h 
b/src/gallium/drivers/llvmpipe/lp_limits.h
index c7905b8..af31b35 100644
--- a/src/gallium/drivers/llvmpipe/lp_limits.h
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -45,8 +45,8 @@
  */
 #define LP_MAX_TEXTURE_SIZE (1 * 1024 * 1024 * 1024ULL)  /* 1GB for now */
 #define LP_MAX_TEXTURE_2D_LEVELS 14  /* 8K x 8K for now */
-#define LP_MAX_TEXTURE_3D_LEVELS 11  /* 1K x 1K x 1K for now */
-#define LP_MAX_TEXTURE_CUBE_LEVELS 13  /* 4K x 4K for now */
+#define LP_MAX_TEXTURE_3D_LEVELS 12  /* 2K x 2K x 2K for now */
+#define LP_MAX_TEXTURE_CUBE_LEVELS 14  /* 8K x 8K for now */
 #define LP_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */
 
 
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: work around slow code generated for interleaving 128bit vectors

2013-06-04 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We use 128bit vector interleave for untwiddling in the blend code (with
256bit vectors). llvm generates terrible code for this for some reason,
so instead of generating a shuffle for 2 128bit vectors use an
extract/insert shuffle (it only seems to matter that we're not using
128bit wide vectors for the shuffle). This decreases instruction count of
the blend code generated for a rgba8 render target without blending from
169 to 113 with llvm 3.1 and from 136 to 114 in llvm 3.2/3.3, and I got
a ~8% (llvm 3.1) and ~5% (3.2/3.3) performance improvement in gears.
(The generated code is still not terribly good as we could actually avoid
the interleaving completely but llvm can't know this.)
---
 src/gallium/auxiliary/gallivm/lp_bld_pack.c |   22 ++
 1 file changed, 22 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c 
b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 14fcd38..f660165 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -271,6 +271,28 @@ lp_build_interleave2(struct gallivm_state *gallivm,
 {
LLVMValueRef shuffle;
 
+   if (type.length == 2  type.width == 128  util_cpu_caps.has_avx) {
+  /*
+   * This is a workaround for llvm code generation deficiency. Strangely
+   * enough, while this needs  vinsertf128/vextractf128 instructions (hence
+   * a natural match when using 2x128bit vectors) the normal unpack 
shuffle
+   * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 
3.2, 3.3).
+   * So use some different shuffles instead (the exact shuffles don't seem 
to
+   * matter, as long as not using 128bit wide vectors, works with 8x32 or 
4x64).
+   */
+  struct lp_type tmp_type = type;
+  LLVMValueRef srchalf[2], tmpdst;
+  tmp_type.length = 4;
+  tmp_type.width = 64;
+  a = LLVMBuildBitCast(gallivm-builder, a, lp_build_vec_type(gallivm, 
tmp_type), );
+  b = LLVMBuildBitCast(gallivm-builder, b, lp_build_vec_type(gallivm, 
tmp_type), );
+  srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
+  srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
+  tmp_type.length = 2;
+  tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
+  return LLVMBuildBitCast(gallivm-builder, tmpdst, 
lp_build_vec_type(gallivm, type), );
+
+   }
shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
 
return LLVMBuildShuffleVector(gallivm-builder, a, b, shuffle, );
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] gallivm: (trivial) fix lp_build_concat_n

2013-06-03 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The code was designed to handle no-op concat but failed (unless the
caller was using same pointer for src and dst).
---
 src/gallium/auxiliary/gallivm/lp_bld_pack.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c 
b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 0a57e39..14fcd38 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -241,8 +241,12 @@ lp_build_concat_n(struct gallivm_state *gallivm,
assert(num_srcs = num_dsts);
assert((num_srcs % size) == 0);
 
-   if (num_srcs == num_dsts)
+   if (num_srcs == num_dsts) {
+  for (i = 0; i  num_dsts; ++i) {
+ dst[i] = src[i];
+  }
   return 1;
+   }
 
for (i = 0; i  num_dsts; ++i) {
   dst[i] = lp_build_concat(gallivm, src[i * size], src_type, size);
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] gallivm: enhance special sse2 4x4f and 2x8f -> 1x16ub conversion

2013-06-03 Thread sroland
From: Roland Scheidegger srol...@vmware.com

There's no good reason why it can't handle 2x4f->1x8ub, 1x4f->1x4ub and
1x8f->1x8ub cases. There might be legitimate reasons why we don't have
enough input vectors for a full destination vector, and using pack
intrinsics should still be much better than using generic conversion
(it looks like convert_alpha from the blend code might hit this though
I suspect it could be avoided).
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |   90 +--
 1 file changed, 58 insertions(+), 32 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index eb2d096..f11361a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -530,24 +530,22 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
dst_type-width== 8)
{
   /* Special case 4x4f -- 1x16ub */
-  if (src_type.length == 4  util_cpu_caps.has_sse2)
+  if (src_type.length == 4 
+  util_cpu_caps.has_sse2)
   {
- assert((num_srcs % 4) == 0);
-
- num_dsts = num_srcs / 4;
- dst_type-length = 16;
+ num_dsts = (num_srcs + 3) / 4;
+ dst_type-length = num_srcs * 4 = 16 ? 16 : num_srcs * 4;
 
  lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, 
num_dsts);
  return num_dsts;
   }
 
   /* Special case 2x8f -- 1x16ub */
-  if (src_type.length == 8  util_cpu_caps.has_avx)
+  if (src_type.length == 8 
+  util_cpu_caps.has_avx)
   {
- assert((num_srcs % 2) == 0);
-
- num_dsts = num_srcs / 2;
- dst_type-length = 16;
+ num_dsts = (num_srcs + 1) / 2;
+ dst_type-length = num_srcs * 8 = 16 ? 16 : num_srcs * 8;
 
  lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, 
num_dsts);
  return num_dsts;
@@ -602,7 +600,7 @@ lp_build_conv(struct gallivm_state *gallivm,
num_tmps = num_srcs;
 
 
-   /* Special case 4x4f -- 1x16ub 
+   /* Special case 4x4f -- 1x16ub, 2x4f - 1x8ub, 1x4f - 1x4ub
 */
if (src_type.floating == 1 
src_type.fixed== 0 
@@ -616,20 +614,23 @@ lp_build_conv(struct gallivm_state *gallivm,
dst_type.sign == 0 
dst_type.norm == 1 
dst_type.width== 8 
-   dst_type.length   == 16 
 
-   4 * num_dsts  == num_srcs 
+   ((dst_type.length == 16  4 * num_dsts == num_srcs) ||
+(num_dsts == 1  dst_type.length * num_srcs == 16  num_srcs != 3)) 

 
util_cpu_caps.has_sse2)
{
   struct lp_build_context bld;
-  struct lp_type int16_type = dst_type;
-  struct lp_type int32_type = dst_type;
+  struct lp_type int16_type, int32_type;
+  struct lp_type dst_type_ext = dst_type;
   LLVMValueRef const_255f;
   unsigned i, j;
 
   lp_build_context_init(bld, gallivm, src_type);
 
+  dst_type_ext.length = 16;
+  int16_type = int32_type = dst_type_ext;
+
   int16_type.width *= 2;
   int16_type.length /= 2;
   int16_type.sign = 1;
@@ -643,21 +644,34 @@ lp_build_conv(struct gallivm_state *gallivm,
   for (i = 0; i  num_dsts; ++i, src += 4) {
  LLVMValueRef lo, hi;
 
- for (j = 0; j  4; ++j) {
+ for (j = 0; j  dst_type.length / 4; ++j) {
 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, );
 tmp[j] = lp_build_iround(bld, tmp[j]);
  }
 
+ if (num_srcs == 1) {
+tmp[1] = tmp[0];
+ }
+
  /* relying on clamping behavior of sse2 intrinsics here */
  lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
- hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
- dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+
+ if (num_srcs  4) {
+hi = lo;
+ }
+ else {
+hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], 
tmp[3]);
+ }
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
+  }
+  if (num_srcs  4) {
+ dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
   }
 
   return; 
}
 
-   /* Special case 2x8f -- 1x16ub
+   /* Special case 2x8f -- 1x16ub, 1x8f -1x8ub
 */
else if (src_type.floating == 1 
   src_type.fixed== 0 
@@ -671,20 +685,23 @@ lp_build_conv(struct gallivm_state *gallivm,
   dst_type.sign == 0 
   dst_type.norm == 1 
   dst_type.width== 8 
-  dst_type.length   == 16 
 
-  2 * num_dsts  == num_srcs 
+  ((dst_type.length == 16  2 * num_dsts == num_srcs) ||
+   (num_dsts == 1  dst_type.length * num_srcs == 8)) 
 
   util_cpu_caps.has_avx) {
 
   struct lp_build_context bld;
-  struct lp_type int16_type = dst_type;
-  struct lp_type int32_type = dst_type;
+  struct lp_type int16_type, int32_type;
+  

[Mesa-dev] [PATCH 3/4] llvmpipe: cleanup of generate_unswizzled_blend

2013-06-03 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Some parameters were used inconsistently, for instance not using
block_width/block_height/block_size for deriving the number of pixels
but rather relying on guesses from the number of fragment shaders etc,
so fix this up (no actual change in behavior since the block size stays
fixed). (Though most of the code would work with different block_height,
with three exceptions: the hacked r11g11b10 conversions and the
twiddle code, which only work with block_height 2 not 1, and the
blend vector type not being 128bit wide.)
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c |   59 +---
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index a20cc78..a7bd836 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -100,6 +100,7 @@
 #include lp_tex_sample.h
 #include lp_flush.h
 #include lp_state_fs.h
+#include lp_rast.h
 
 
 /** Fragment shader number (for debugging) */
@@ -528,7 +529,7 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
bool twiddle;
bool split;
 
-   unsigned pixels = num_fs == 4 ? 1 : 2;
+   unsigned pixels = type.length / 4;
unsigned reorder_group;
unsigned src_channels;
unsigned src_count;
@@ -537,7 +538,7 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
src_channels = dst_channels  3 ? dst_channels : 4;
src_count = num_fs * src_channels;
 
-   assert(pixels == 2 || num_fs == 4);
+   assert(pixels == 2 || pixels == 1);
assert(num_fs * src_channels = Elements(src));
 
/*
@@ -917,6 +918,7 @@ scale_bits(struct gallivm_state *gallivm,
  */
 static void
 convert_to_blend_type(struct gallivm_state *gallivm,
+  unsigned block_size,
   const struct util_format_description *src_fmt,
   struct lp_type src_type,
   struct lp_type dst_type,
@@ -928,7 +930,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
struct lp_type blend_type;
struct lp_type mem_type;
unsigned i, j, k;
-   unsigned pixels = 16 / num_srcs;
+   unsigned pixels = block_size / num_srcs;
bool is_arith;
 
/*
@@ -945,13 +947,15 @@ convert_to_blend_type(struct gallivm_state *gallivm,
   assert(dst_type.floating);
   assert(dst_type.width == 32);
   assert(dst_type.length % 4 == 0);
+  assert(num_srcs % 4 == 0);
+
   for (i = 0; i  4; i++) {
  tmpsrc[i] = src[i];
   }
   for (i = 0; i  num_srcs / 4; i++) {
  LLVMValueRef tmpsoa[4];
  LLVMValueRef tmps = tmpsrc[i];
- if (num_srcs == 8) {
+ if (dst_type.length == 8) {
 LLVMValueRef shuffles[8];
 unsigned j;
 /* fetch was 4 values but need 8-wide output values */
@@ -1062,6 +1066,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
  */
 static void
 convert_from_blend_type(struct gallivm_state *gallivm,
+unsigned block_size,
 const struct util_format_description *src_fmt,
 struct lp_type src_type,
 struct lp_type dst_type,
@@ -1073,7 +1078,7 @@ convert_from_blend_type(struct gallivm_state *gallivm,
struct lp_type mem_type;
struct lp_type blend_type;
LLVMBuilderRef builder = gallivm-builder;
-   unsigned pixels = 16 / num_srcs;
+   unsigned pixels = block_size / num_srcs;
bool is_arith;
 
/*
@@ -1090,11 +1095,12 @@ convert_from_blend_type(struct gallivm_state *gallivm,
   assert(src_type.width == 32);
   assert(src_type.length % 4 == 0);
   assert(dst_type.width == 32);
+
   for (i = 0; i  num_srcs / 4; i++) {
  LLVMValueRef tmpsoa[4], tmpdst;
  lp_build_transpose_aos(gallivm, src_type, src[i * 4], tmpsoa);
  tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
- if (num_srcs == 8) {
+ if (src_type.length == 8) {
 LLVMValueRef tmpaos, shuffles[8];
 unsigned j;
 /*
@@ -1228,9 +1234,13 @@ convert_alpha(struct gallivm_state *gallivm,
row_type.length = alpha_type.length;
 
/* Twiddle the alpha to match pixels */
-   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha);
+   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, 
src_alpha);
 
-   for (i = 0; i  4; ++i) {
+   /*
+* TODO this should use single lp_build_conv call for
+* src_count == 1  dst_channels == 1 case (dropping the concat below)
+*/
+   for (i = 0; i  block_height; ++i) {
   lp_build_conv(gallivm, alpha_type, row_type, src_alpha[i], 1, 
src_alpha[i], 1);
}
 
@@ -1238,10 +1248,9 @@ convert_alpha(struct gallivm_state *gallivm,
row_type.length = length;
 
/* If only one channel we can only need the single alpha value per pixel */
-   if (src_count == 1) {
-  

[Mesa-dev] [PATCH 4/4] llvmpipe: reduce alignment requirement for 1d resources from 4x4 to 4x1

2013-06-03 Thread sroland
From: Roland Scheidegger srol...@vmware.com

For rendering to buffers, we cannot have any y alignment.
So make sure that tile clear commands only clear up to the fb width/height,
not more (do this for all resources actually as clearing more seems
pointless for other resources too). For the jit fs function, skip execution
of the lower half of the fragment shader for the 4x4 stamp completely,
for depth/stencil only load/store the values from the first row
(replace other row with undef).
For the blend function, also only load half the values from fs output,
replace the rest with undefs so that everything still operates on the
full 4x4 block to keep code the same between 4x1 and 4x4 (except for
load/store of course which also needs to skip (store) or replace these
values with undefs (load)), at the cost of slightly less optimal code
being produced in some cases.
Also reduce 1d and 1d array alignment too, because they can be handled the
same as buffers so don't need to waste memory.

v2: don't try to run special blend code for 4x1, (very) slightly less
complexity if we just use the same code as for 4x4 which may or may not
make it easier to optimize in the future (as we care a lot more about 4x4
performance than 1d).
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |   19 -
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |2 +
 src/gallium/drivers/llvmpipe/lp_rast.c  |8 +-
 src/gallium/drivers/llvmpipe/lp_scene.c |2 -
 src/gallium/drivers/llvmpipe/lp_scene.h |4 -
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  109 +--
 src/gallium/drivers/llvmpipe/lp_state_fs.h  |1 +
 src/gallium/drivers/llvmpipe/lp_texture.c   |   24 --
 src/gallium/drivers/llvmpipe/lp_texture.h   |   21 ++
 9 files changed, 146 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index df6a6c4..a8bd15f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -525,6 +525,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
  *
  * \param type  the data type of the fragment depth/stencil values
  * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
  * \param loop_counter  the current loop iteration
  * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
  * \param depth_stride  stride of the depth/stencil buffer
@@ -535,6 +536,7 @@ void
 lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
  struct lp_type z_src_type,
  const struct util_format_description 
*format_desc,
+ boolean is_1d,
  LLVMValueRef depth_ptr,
  LLVMValueRef depth_stride,
  LLVMValueRef *z_fb,
@@ -592,9 +594,14 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state 
*gallivm,
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, depth_offset1, 1, );
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, );
zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, );
-   zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, depth_offset2, 1, );
-   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, );
-   zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, );
+   if (is_1d) {
+  zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+   }
+   else {
+  zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, depth_offset2, 1, );
+  zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, );
+  zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, );
+   }
 
*z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
   LLVMConstVector(shuffles, zs_type.length), 
);
@@ -648,6 +655,7 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state 
*gallivm,
  *
  * \param type  the data type of the fragment depth/stencil values
  * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param z_fb  z values read from fb (with padding)
  * \param s_fb  s values read from fb (with padding)
@@ -661,6 +669,7 @@ void
 lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
   struct lp_type z_src_type,
   const struct util_format_description 
*format_desc,
+  boolean is_1d,
   struct lp_build_mask_context *mask,
   LLVMValueRef z_fb,
   LLVMValueRef s_fb,
@@ -791,7 +800,9 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state 
*gallivm,
}
 

[Mesa-dev] [PATCH] llvmpipe: reduce alignment requirement for 1d resources from 4x4 to 4x1

2013-05-31 Thread sroland
From: Roland Scheidegger srol...@vmware.com

For rendering to buffers, we cannot have any y alignment.
So make sure that tile clear commands only clear up to the fb width/height,
not more (do this for all resources actually as clearing more seems
pointless for other resources too). For the jit fs function, skip execution
of the lower half of the fragment shader for the 4x4 stamp completely,
for depth/stencil only load/store the values from the first row
(replace other row with undef).
For the blend function, also only load half the values from fs output, drop
the second row after untwiddling (fix up some issues there due to inconsistent
usage of block_width/block_height/block_size, num_fs and fs type length).
Also reduce 1d and 1d array alignment too, because they can be handled the
same as buffers so don't need to waste memory.
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |   90 +++
 src/gallium/auxiliary/gallivm/lp_bld_pack.c |6 +-
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |   19 ++-
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |2 +
 src/gallium/drivers/llvmpipe/lp_rast.c  |8 +-
 src/gallium/drivers/llvmpipe/lp_scene.c |2 -
 src/gallium/drivers/llvmpipe/lp_scene.h |4 -
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  228 +++
 src/gallium/drivers/llvmpipe/lp_state_fs.h  |1 +
 src/gallium/drivers/llvmpipe/lp_texture.c   |   24 ++-
 src/gallium/drivers/llvmpipe/lp_texture.h   |   21 +++
 11 files changed, 281 insertions(+), 124 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index eb2d096..f11361a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -530,24 +530,22 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
dst_type-width== 8)
{
   /* Special case 4x4f -- 1x16ub */
-  if (src_type.length == 4  util_cpu_caps.has_sse2)
+  if (src_type.length == 4 
+  util_cpu_caps.has_sse2)
   {
- assert((num_srcs % 4) == 0);
-
- num_dsts = num_srcs / 4;
- dst_type-length = 16;
+ num_dsts = (num_srcs + 3) / 4;
+ dst_type-length = num_srcs * 4 = 16 ? 16 : num_srcs * 4;
 
  lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, 
num_dsts);
  return num_dsts;
   }
 
   /* Special case 2x8f -- 1x16ub */
-  if (src_type.length == 8  util_cpu_caps.has_avx)
+  if (src_type.length == 8 
+  util_cpu_caps.has_avx)
   {
- assert((num_srcs % 2) == 0);
-
- num_dsts = num_srcs / 2;
- dst_type-length = 16;
+ num_dsts = (num_srcs + 1) / 2;
+ dst_type-length = num_srcs * 8 = 16 ? 16 : num_srcs * 8;
 
  lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, 
num_dsts);
  return num_dsts;
@@ -602,7 +600,7 @@ lp_build_conv(struct gallivm_state *gallivm,
num_tmps = num_srcs;
 
 
-   /* Special case 4x4f -- 1x16ub 
+   /* Special case 4x4f -- 1x16ub, 2x4f - 1x8ub, 1x4f - 1x4ub
 */
if (src_type.floating == 1 
src_type.fixed== 0 
@@ -616,20 +614,23 @@ lp_build_conv(struct gallivm_state *gallivm,
dst_type.sign == 0 
dst_type.norm == 1 
dst_type.width== 8 
-   dst_type.length   == 16 
 
-   4 * num_dsts  == num_srcs 
+   ((dst_type.length == 16  4 * num_dsts == num_srcs) ||
+(num_dsts == 1  dst_type.length * num_srcs == 16  num_srcs != 3)) 

 
util_cpu_caps.has_sse2)
{
   struct lp_build_context bld;
-  struct lp_type int16_type = dst_type;
-  struct lp_type int32_type = dst_type;
+  struct lp_type int16_type, int32_type;
+  struct lp_type dst_type_ext = dst_type;
   LLVMValueRef const_255f;
   unsigned i, j;
 
   lp_build_context_init(bld, gallivm, src_type);
 
+  dst_type_ext.length = 16;
+  int16_type = int32_type = dst_type_ext;
+
   int16_type.width *= 2;
   int16_type.length /= 2;
   int16_type.sign = 1;
@@ -643,21 +644,34 @@ lp_build_conv(struct gallivm_state *gallivm,
   for (i = 0; i  num_dsts; ++i, src += 4) {
  LLVMValueRef lo, hi;
 
- for (j = 0; j  4; ++j) {
+ for (j = 0; j  dst_type.length / 4; ++j) {
 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, );
 tmp[j] = lp_build_iround(bld, tmp[j]);
  }
 
+ if (num_srcs == 1) {
+tmp[1] = tmp[0];
+ }
+
  /* relying on clamping behavior of sse2 intrinsics here */
  lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
- hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
- dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+
+ if (num_srcs  4) {
+hi = lo;
+ }
+ else {
+hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], 

[Mesa-dev] [PATCH] llvmpipe: fix bogus assertions for buffer surfaces

2013-05-31 Thread sroland
From: Roland Scheidegger srol...@vmware.com

One of the assertions made no sense for buffer rendertargets
(due to the union), so drop it. (The same assertion is present already in
the path for texture surfaces later.)
---
 src/gallium/drivers/llvmpipe/lp_texture.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index a02ddbc..b959e82 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -485,7 +485,6 @@ llvmpipe_create_surface(struct pipe_context *pipe,
 {
struct pipe_surface *ps;
 
-   assert(surf_tmpl-u.tex.level = pt-last_level);
if (!(pt-bind  (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET)))
   debug_printf(Illegal surface creation without bind flag\n);
 
@@ -513,7 +512,8 @@ llvmpipe_create_surface(struct pipe_context *pipe,
  ps-u.buf.first_element = surf_tmpl-u.buf.first_element;
  ps-u.buf.last_element = surf_tmpl-u.buf.last_element;
  assert(ps-u.buf.first_element = ps-u.buf.last_element);
- assert(ps-u.buf.last_element  ps-width);
+ assert(util_format_get_blocksize(surf_tmpl-format) *
+ps-u.buf.last_element  pt-width0);
   }
}
return ps;
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallium: add support for layered rendering

2013-05-31 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Since pipe_surface already has all the necessary fields no interface
changes are necessary except adding a new shader semantic value
(TGSI_SEMANTIC_LAYER), though add a pipe capability bit for it as well.
(Note that what GL knows as the gl_Layer variable, d3d10 names
RENDER_TARGET_ARRAY_INDEX.)
---
 src/gallium/docs/source/screen.rst |2 ++
 src/gallium/include/pipe/p_defines.h   |3 ++-
 src/gallium/include/pipe/p_shader_tokens.h |3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/docs/source/screen.rst 
b/src/gallium/docs/source/screen.rst
index 683080c..b74b237 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -168,6 +168,8 @@ The integer capabilities:
   since they are linked) a driver can support. Returning 0 is equivalent
   to returning 1 because every driver has to support at least a single
   viewport/scissor combination.  
+* ``PIPE_CAP_LAYERED_RENDERING``: Whether rendering to multiple layers is
+  supported using layer selection by the TGSI_SEMANTIC_LAYER shader variable.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/include/pipe/p_defines.h 
b/src/gallium/include/pipe/p_defines.h
index 8af1a84..c359a9e 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -508,7 +508,8 @@ enum pipe_cap {
PIPE_CAP_QUERY_PIPELINE_STATISTICS = 81,
PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK = 82,
PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE = 83,
-   PIPE_CAP_MAX_VIEWPORTS = 84
+   PIPE_CAP_MAX_VIEWPORTS = 84,
+   PIPE_CAP_MULTIPLE_LAYERS = 85
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1  0)
diff --git a/src/gallium/include/pipe/p_shader_tokens.h 
b/src/gallium/include/pipe/p_shader_tokens.h
index b33cf1d..c984d50 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -165,7 +165,8 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_TEXCOORD   19 /** texture or sprite coordinates */
 #define TGSI_SEMANTIC_PCOORD 20 /** point sprite coordinate */
 #define TGSI_SEMANTIC_VIEWPORT_INDEX 21 /** viewport index */
-#define TGSI_SEMANTIC_COUNT  22 /** number of semantic values */
+#define TGSI_SEMANTIC_LAYER  22 /** layer (rendertarget index) */
+#define TGSI_SEMANTIC_COUNT  23 /** number of semantic values */
 
 struct tgsi_declaration_semantic
 {
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix out-of-bounds access with mirror_clamp_to_edge address mode

2013-05-31 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Surprising this bug survived so long, we were missing a clamp (in the
linear filtering version).
(Valgrind complained a lot about invalid reads with piglit texwrap,
I've also seen spurious failures in this test which might have
happened due to this. Valgrind probably didn't complain before the
alignment reduction in llvmpipe to 4x4 since the test is using tiny
textures so the reads were still always well within allocated area.)
While here, also do an effective clamp (after half subtraction)
of [0,length-0.5] instead of [0, length-1] which saves an instruction
(the filtering weight could be different due to this, but only if
both texels point to the same max texel so it doesn't matter).
(Both changes are borrowed from PIPE_TEX_CLAMP_TO_EDGE case.)

Note: This is a candidate for the stable branches.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |   13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 7ac0029..e0a59d0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -436,7 +436,6 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context 
*bld,
 
case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   {
- LLVMValueRef min, max;
  struct lp_build_context abs_coord_bld = bld-coord_bld;
  abs_coord_bld.type.sign = FALSE;
 
@@ -450,16 +449,18 @@ lp_build_sample_wrap_linear(struct 
lp_build_sample_context *bld,
  }
  coord = lp_build_abs(coord_bld, coord);
 
- /* clamp to [0.5, length - 0.5] */
- min = half;
- max = lp_build_sub(coord_bld, length_f, min);
- coord = lp_build_clamp(coord_bld, coord, min, max);
-
+ /* clamp to length max */
+ coord = lp_build_min(coord_bld, coord, length_f);
+ /* subtract 0.5 */
  coord = lp_build_sub(coord_bld, coord, half);
+ /* clamp to [0, length - 0.5] */
+ coord = lp_build_max(coord_bld, coord, coord_bld-zero);
 
  /* convert to int, compute lerp weight */
  lp_build_ifloor_fract(abs_coord_bld, coord, coord0, weight);
  coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld-one);
+ /* coord1 = min(coord1, length-1) */
+ coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
   }
   break;
 
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: get rid of tiled/linear layout remains

2013-05-28 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Eliminate the rest of the no longer needed layout logic.
(It is possible some code could be simplified a bit further still.)
---
 src/gallium/drivers/llvmpipe/lp_scene.c |6 +-
 src/gallium/drivers/llvmpipe/lp_setup.c |6 +-
 src/gallium/drivers/llvmpipe/lp_state_sampler.c |6 +-
 src/gallium/drivers/llvmpipe/lp_surface.c   |6 +-
 src/gallium/drivers/llvmpipe/lp_texture.c   |  197 +++
 src/gallium/drivers/llvmpipe/lp_texture.h   |   52 ++
 6 files changed, 47 insertions(+), 226 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c 
b/src/gallium/drivers/llvmpipe/lp_scene.c
index e05ea75..3a3ba75 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -163,8 +163,7 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
  scene-cbufs[i].map = llvmpipe_resource_map(cbuf-texture,
  cbuf-u.tex.level,
  cbuf-u.tex.first_layer,
- LP_TEX_USAGE_READ_WRITE,
- LP_TEX_LAYOUT_LINEAR);
+ LP_TEX_USAGE_READ_WRITE);
   }
   else {
  struct llvmpipe_resource *lpr = llvmpipe_resource(cbuf-texture);
@@ -184,8 +183,7 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
   scene-zsbuf.map = llvmpipe_resource_map(zsbuf-texture,
zsbuf-u.tex.level,
zsbuf-u.tex.first_layer,
-   LP_TEX_USAGE_READ_WRITE,
-   LP_TEX_LAYOUT_LINEAR);
+   LP_TEX_USAGE_READ_WRITE);
}
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index 9fef34e..a6dce24 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -704,8 +704,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
 * still are tiled.
 */
mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level,
-LP_TEX_USAGE_READ,
-LP_TEX_LAYOUT_LINEAR);
+LP_TEX_USAGE_READ);
jit_tex-base = lp_tex-linear_img.data;
 }
 else {
@@ -736,8 +735,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
if (llvmpipe_resource_is_texture(res)) {
   for (j = first_level; j = last_level; j++) {
  mip_ptr = llvmpipe_get_texture_image_all(lp_tex, j,
-  
LP_TEX_USAGE_READ,
-  
LP_TEX_LAYOUT_LINEAR);
+  
LP_TEX_USAGE_READ);
  jit_tex-mip_offsets[j] = (uint8_t *)mip_ptr - (uint8_t 
*)jit_tex-base;
  /*
   * could get mip offset directly but need call above to
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 6b7e327..ee2e444 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -300,14 +300,12 @@ prepare_shader_sampling(
/* must trigger allocation first before we can get base ptr */
/* XXX this may fail due to OOM ? */
mip_ptr = llvmpipe_get_texture_image_all(lp_tex, 
view-u.tex.first_level,
-LP_TEX_USAGE_READ,
-LP_TEX_LAYOUT_LINEAR);
+LP_TEX_USAGE_READ);
addr = lp_tex-linear_img.data;
 
for (j = first_level; j = last_level; j++) {
   mip_ptr = llvmpipe_get_texture_image_all(lp_tex, j,
-   LP_TEX_USAGE_READ,
-   
LP_TEX_LAYOUT_LINEAR);
+   LP_TEX_USAGE_READ);
   mip_offsets[j] = (uint8_t *)mip_ptr - (uint8_t *)addr;
   /*
* could get mip offset directly but need call above to
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c 
b/src/gallium/drivers/llvmpipe/lp_surface.c
index 5e6a6eb..961d0bc 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ 

[Mesa-dev] [PATCH] llvmpipe: reduce alignment requirement for resources from 64x64 to 4x4

2013-05-28 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The overallocation was very bad especially for things like 1d array
textures which got blown up by a factor of 64. (Even ordinary smallish
2d textures benefit a lot from this, a mipmapped 64x64 rgba8 texture
previously used 7*16kB = 112kB instead of now ~22kB.)
4x4 is chosen because this is the size the jit functions run on, so
making it smaller is going to be a bit more complicated.
It is actually not strictly 4x4 pixels, since we'd want to avoid situations
where different threads are rendering to the same cacheline so we keep
cacheline size alignment in x direction (often 64bytes).
To make this work introduce new task width/height parameters and make
sure clears don't clear the whole tile if it's a partial tile. Likewise,
the rasterizer may produce fragments outside the 4x4 blocks present in a
tile, so don't call the jit function for them.
This does not yet fix rendering to buffers (which cannot have any y
alignment at all), and 1d/1d array textures are still overallocated by a
factor of 4.
---
 src/gallium/drivers/llvmpipe/lp_rast.c  |   56 ---
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |   37 +++---
 src/gallium/drivers/llvmpipe/lp_scene.c |2 +
 src/gallium/drivers/llvmpipe/lp_scene.h |4 ++
 src/gallium/drivers/llvmpipe/lp_setup.c |3 +-
 src/gallium/drivers/llvmpipe/lp_texture.c   |   26 ++---
 6 files changed, 75 insertions(+), 53 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c 
b/src/gallium/drivers/llvmpipe/lp_rast.c
index 5c837a0..be5a286 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -95,6 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
task-bin = bin;
task-x = x * TILE_SIZE;
task-y = y * TILE_SIZE;
+   task-width = TILE_SIZE + x * TILE_SIZE  task-scene-width_aligned ?
+task-scene-width_aligned - x * TILE_SIZE : TILE_SIZE;
+   task-height = TILE_SIZE + y * TILE_SIZE  task-scene-height_aligned ?
+task-scene-height_aligned - y * TILE_SIZE : TILE_SIZE;
 
/* reset pointers to color and depth tile(s) */
memset(task-color_tiles, 0, sizeof(task-color_tiles));
@@ -144,8 +148,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
scene-cbufs[i].stride,
task-x,
task-y,
-   TILE_SIZE,
-   TILE_SIZE,
+   task-width,
+   task-height,
uc);
  }
   }
@@ -172,8 +176,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
scene-cbufs[i].stride,
task-x,
task-y,
-   TILE_SIZE,
-   TILE_SIZE,
+   task-width,
+   task-height,
uc);
  }
   }
@@ -198,8 +202,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
uint64_t clear_mask64 = arg.clear_zstencil.mask;
uint32_t clear_value = (uint32_t) clear_value64;
uint32_t clear_mask = (uint32_t) clear_mask64;
-   const unsigned height = TILE_SIZE;
-   const unsigned width = TILE_SIZE;
+   const unsigned height = task-height;
+   const unsigned width = task-width;
const unsigned block_size = scene-zsbuf.blocksize;
const unsigned dst_stride = scene-zsbuf.stride;
uint8_t *dst;
@@ -325,8 +329,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
variant = state-variant;
 
/* render the whole 64x64 tile in 4x4 chunks */
-   for (y = 0; y  TILE_SIZE; y += 4){
-  for (x = 0; x  TILE_SIZE; x += 4) {
+   for (y = 0; y  task-height; y += 4){
+  for (x = 0; x  task-width; x += 4) {
  uint8_t *color[PIPE_MAX_COLOR_BUFS];
  unsigned stride[PIPE_MAX_COLOR_BUFS];
  uint8_t *depth = NULL;
@@ -434,21 +438,27 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
 
assert(lp_check_alignment(state-jit_context.u8_blend_color, 16));
 
-   /* run shader on 4x4 block */
-   BEGIN_JIT_CALL(state, task);
-   variant-jit_function[RAST_EDGE_TEST](state-jit_context,
- x, y,
- inputs-frontfacing,
- GET_A0(inputs),
- GET_DADX(inputs),
- GET_DADY(inputs),
- color,
- depth,
- mask,
- task-thread_data,
- stride,
- depth_stride);
-   END_JIT_CALL();
+   /*
+* The rasterizer may produce fragments 

[Mesa-dev] [PATCH 1/5] llvmpipe: fix bug in early depth test / late depth write handling

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We were using the wrong type if the format was less than 32 bits.
No piglit changes as it doesn't hit that path.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |   13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 2376ca7..08138f0 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -680,15 +680,15 @@ lp_build_depth_stencil_write_swizzled(struct 
gallivm_state *gallivm,
LLVMTypeRef load_ptr_type;
unsigned depth_bytes = format_desc-block.bits / 8;
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+   struct lp_type z_type = zs_type;
struct lp_type zs_load_type = zs_type;
 
zs_load_type.length = zs_load_type.length / 2;
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 
0);
 
-   if (zs_type.width  32)
-  zs_type.width = 32;
+   z_type.width = z_src_type.width;
 
-   lp_build_context_init(z_bld, gallivm, zs_type);
+   lp_build_context_init(z_bld, gallivm, z_type);
 
/*
 * This is far from ideal, at least for late depth write we should do this
@@ -742,7 +742,8 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state 
*gallivm,
 
if (zs_type.width  z_src_type.width) {
   /* Truncate ZS values (e.g., when writing to Z16_UNORM) */
-  z_value = LLVMBuildTrunc(builder, z_value, z_bld.vec_type, );
+  z_value = LLVMBuildTrunc(builder, z_value,
+   lp_build_int_vec_type(gallivm, zs_type), );
}
 
if (format_desc-block.bits = 32) {
@@ -762,9 +763,9 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state 
*gallivm,
}
else {
   if (z_src_type.length == 4) {
- zs_dst1 = lp_build_interleave2(gallivm, zs_type,
+ zs_dst1 = lp_build_interleave2(gallivm, z_type,
 z_value, s_value, 0);
- zs_dst2 = lp_build_interleave2(gallivm, zs_type,
+ zs_dst2 = lp_build_interleave2(gallivm, z_type,
 z_value, s_value, 1);
   }
   else {
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/5] llvmpipe: (trivial) remove confusing code in stencil test

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This was meant to disable some code which isn't needed when depth/stencil
isn't written. However, there's more code which wouldn't be needed in that
case so having the condition there was just odd (llvm will drop all the code
anyway).
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |   27 +++
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 08138f0..5ef9947 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1097,23 +1097,18 @@ lp_build_depth_stencil_test(struct gallivm_state 
*gallivm,
   stencil_shift, );
 
/* Finally, merge the z/stencil values */
-   if ((depth-enabled  depth-writemask) ||
-   (stencil[0].enabled  (stencil[0].writemask ||
-   (stencil[1].enabled  stencil[1].writemask 
{
-
-  if (format_desc-block.bits = 32) {
- if (have_z  have_s)
-*z_value = LLVMBuildOr(builder, z_dst, stencil_vals, );
- else if (have_z)
-*z_value = z_dst;
- else
-*z_value = stencil_vals;
- *s_value = *z_value;
-  }
-  else {
+   if (format_desc-block.bits = 32) {
+  if (have_z  have_s)
+ *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, );
+  else if (have_z)
  *z_value = z_dst;
- *s_value = stencil_vals;
-  }
+  else
+ *z_value = stencil_vals;
+  *s_value = *z_value;
+   }
+   else {
+  *z_value = z_dst;
+  *s_value = stencil_vals;
}
 
if (s_pass_mask)
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/5] llvmpipe: fix issue with not writing new stencil values

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We did mask checks between depth/stencil testing and depth/stencil write.
This meant that if the depth/stencil test killed off all fragments we never
actually wrote the new stencil value. This issue affected all early/late
test/write combinations.
So move the mask check after depth/stencil write (for early depth test,
could do the same for late depth test but might not be worth it at that
point so just skip it there).
This addresses https://bugs.freedesktop.org/show_bug.cgi?id=41787.
Piglit does not hit this issue because of the simple_shader optimization
in generate_fs_loop() which means we're skipping the mask checks.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |4 
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |3 +++
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 5ef9947..df6a6c4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1116,9 +1116,5 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
 
if (depth-enabled  stencil[0].enabled)
   lp_build_mask_update(mask, z_pass);
-
-   if (do_branch)
-  lp_build_mask_check(mask);
-
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 1dfc75a..b1696ee 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -361,6 +361,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
depth_ptr, depth_stride,
z_value, s_value);
   }
+
+  if (!simple_shader  key-stencil[0].enabled)
+ lp_build_mask_check(mask);
}
 
lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/5] llvmpipe: fix early depth test / late depth write stencil issues

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We actually did early depth/stencil test and late depth/stencil write even
when the shader could kill the fragment (alpha test or discard). Since it
matters for the new stencil value if the fragment is killed by depth/stencil
test or by the shader (in which case it will not reach the depth/stencil
test) this simply cannot work (we also would possibly skip writing the new
stencil value due to mask checks but this is a secondary issue).
So use late depth test / late depth write instead in this case.
(No piglit changes as it doesn't seem to hit such bogus early depth test
/ late depth write path.)
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c |   17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b1696ee..9661273 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -266,13 +266,20 @@ generate_fs_loop(struct gallivm_state *gallivm,
   assert(zs_format_desc);
 
   if (!shader-info.base.writes_z) {
- if (key-alpha.enabled || shader-info.base.uses_kill)
+ if (key-alpha.enabled || shader-info.base.uses_kill) {
 /* With alpha test and kill, can do the depth test early
  * and hopefully eliminate some quads.  But need to do a
  * special deferred depth write once the final mask value
- * is known.
+ * is known. This only works though if there's either no
+ * stencil test or the stencil value isn't written.
  */
-depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+if (key-stencil[0].enabled  (key-stencil[0].writemask ||
+(key-stencil[1].enabled 
+ key-stencil[1].writemask)))
+   depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+else
+   depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+ }
  else
 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
   }
@@ -281,9 +288,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
   }
 
   if (!(key-depth.enabled  key-depth.writemask) 
-  !((key-stencil[0].enabled  (key-stencil[0].writemask ||
+  !(key-stencil[0].enabled  (key-stencil[0].writemask ||
 (key-stencil[1].enabled 
- key-stencil[1].writemask)
+ key-stencil[1].writemask
  depth_mode = ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
}
else {
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/5] llvmpipe: disable simple_shader optimization

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This optimization disabled mask checks if the shader is simple enough.
While this should work correctly, the problem is that it can hide real issues
because shaders in practice are usually complex enough (8 instructions or 1
texture is already enough) so this doesn't get used, whereas dumbed-down
tests which should hit all the same code paths suddenly do something quite
different. This was the reason that bug 41787 could not be easily tracked as
stencil test not working correctly (piglit would in fact have failed some
tests without that optimization).
So disable it for now, it's unclear if it's much of a win in any case.
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 9661273..b06f915 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -247,7 +247,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
struct lp_build_mask_context mask;
boolean simple_shader = (shader-info.base.file_count[TGSI_FILE_SAMPLER] == 
0 
 shader-info.base.num_inputs  3 
-shader-info.base.num_instructions  8);
+shader-info.base.num_instructions  8)  0;
const boolean dual_source_blend = key-blend.rt[0].blend_enable 
  util_blend_state_is_dual(key-blend, 0);
unsigned attrib;
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] softpipe: disambiguate TILE_SIZE / TEX_TILE_SIZE

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

These can be different (just like NUM_TEX_TILE_ENTRIES / NUM_ENTRIES),
though currently they aren't.
---
 src/gallium/drivers/softpipe/sp_tex_sample.c |   28 +++---
 src/gallium/drivers/softpipe/sp_tex_tile_cache.c |   28 +++---
 src/gallium/drivers/softpipe/sp_tex_tile_cache.h |   20 
 3 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 1550199..2c7f17f 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -580,10 +580,10 @@ get_texel_2d_no_border(const struct sp_sampler_view 
*sp_sview,
union tex_tile_address addr, int x, int y)
 {
const struct softpipe_tex_cached_tile *tile;
-   addr.bits.x = x / TILE_SIZE;
-   addr.bits.y = y / TILE_SIZE;
-   y %= TILE_SIZE;
-   x %= TILE_SIZE;
+   addr.bits.x = x / TEX_TILE_SIZE;
+   addr.bits.y = y / TEX_TILE_SIZE;
+   y %= TEX_TILE_SIZE;
+   x %= TEX_TILE_SIZE;
 
tile = sp_get_cached_tile_tex(sp_sview-cache, addr);
 
@@ -722,10 +722,10 @@ get_texel_quad_2d_no_border_single_tile(const struct 
sp_sampler_view *sp_sview,
 {
 const struct softpipe_tex_cached_tile *tile;
 
-   addr.bits.x = x / TILE_SIZE;
-   addr.bits.y = y / TILE_SIZE;
-   y %= TILE_SIZE;
-   x %= TILE_SIZE;
+   addr.bits.x = x / TEX_TILE_SIZE;
+   addr.bits.y = y / TEX_TILE_SIZE;
+   y %= TEX_TILE_SIZE;
+   x %= TEX_TILE_SIZE;
 
tile = sp_get_cached_tile_tex(sp_sview-cache, addr);
   
@@ -777,11 +777,11 @@ get_texel_3d_no_border(const struct sp_sampler_view 
*sp_sview,
 {
const struct softpipe_tex_cached_tile *tile;
 
-   addr.bits.x = x / TILE_SIZE;
-   addr.bits.y = y / TILE_SIZE;
+   addr.bits.x = x / TEX_TILE_SIZE;
+   addr.bits.y = y / TEX_TILE_SIZE;
addr.bits.z = z;
-   y %= TILE_SIZE;
-   x %= TILE_SIZE;
+   y %= TEX_TILE_SIZE;
+   x %= TEX_TILE_SIZE;
 
tile = sp_get_cached_tile_tex(sp_sview-cache, addr);
 
@@ -917,8 +917,8 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view 
*sp_sview,
 {
unsigned xpot = pot_level_size(sp_sview-xpot, level);
unsigned ypot = pot_level_size(sp_sview-ypot, level);
-   unsigned xmax = (xpot - 1)  (TILE_SIZE - 1); /* MIN2(TILE_SIZE, xpot) - 1; 
*/
-   unsigned ymax = (ypot - 1)  (TILE_SIZE - 1); /* MIN2(TILE_SIZE, ypot) - 1; 
*/
+   unsigned xmax = (xpot - 1)  (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, 
xpot) - 1; */
+   unsigned ymax = (ypot - 1)  (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, 
ypot) - 1; */
union tex_tile_address addr;
int c;
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c 
b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
index af1024d..b0d8a18 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -50,7 +50,7 @@ sp_create_tex_tile_cache( struct pipe_context *pipe )
uint pos;
 
/* make sure max texture size works */
-   assert((TILE_SIZE  TEX_ADDR_BITS) = (1  (SP_MAX_TEXTURE_2D_LEVELS-1)));
+   assert((TEX_TILE_SIZE  TEX_ADDR_BITS) = (1  
(SP_MAX_TEXTURE_2D_LEVELS-1)));
 
tc = CALLOC_STRUCT( softpipe_tex_tile_cache );
if (tc) {
@@ -212,7 +212,7 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
 
if (addr.value != tile-addr.value) {
 
-  /* cache miss.  Most misses are because we've invaldiated the
+  /* cache miss.  Most misses are because we've invalidated the
* texture cache previously -- most commonly on binding a new
* texture.  Currently we effectively flush the cache on texture
* bind.
@@ -265,26 +265,26 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache 
*tc,
*/
   if (!zs  util_format_is_pure_uint(tc-format)) {
  pipe_get_tile_ui_format(tc-tex_trans, tc-tex_trans_map,
- addr.bits.x * TILE_SIZE,
- addr.bits.y * TILE_SIZE,
- TILE_SIZE,
- TILE_SIZE,
+ addr.bits.x * TEX_TILE_SIZE,
+ addr.bits.y * TEX_TILE_SIZE,
+ TEX_TILE_SIZE,
+ TEX_TILE_SIZE,
  tc-format,
  (unsigned *) tile-data.colorui);
   } else if (!zs  util_format_is_pure_sint(tc-format)) {
  pipe_get_tile_i_format(tc-tex_trans, tc-tex_trans_map,
-addr.bits.x * TILE_SIZE,
-addr.bits.y * TILE_SIZE,
-TILE_SIZE,
- TILE_SIZE,
+addr.bits.x * TEX_TILE_SIZE,
+addr.bits.y * TEX_TILE_SIZE,
+TEX_TILE_SIZE,
+

[Mesa-dev] [PATCH 2/2] softpipe: change TEX_TILE_SIZE and NUM_TEX_TILE_ENTRIES

2013-05-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Initially we had NUM_TEX_TILE_ENTRIES of 50, however this was using too much
memory (mostly because the tile cache is operating on fixed max current
sampler views which could be fixed but that's another topic). So it was
decreased to 4. However this is a ridiculously low number which can't
actually really work (the number of tiles needed for as little as
a single quad with linear_mipmap_linear is 2 to 8 for a 2d texture, and
4 to 16 for a 3d texture), as it just about guarantees there will be
cache thrashing sometimes (just about always for 3d textures in fact, since
while there are 4 entries the cache is direct mapped).
So increase that number to 16 (which is still on the low side for direct
mapped cache though I guess using something like 4-way associativity would
be more effective than increasing this further) which has at least some good
chance to avoid thrashing. Since we don't want to increase memory requirements
however in turn decrease the tile size accordingly from 64 to 32 (as a bonus
point this also decreases the cost of texture thrashing which might still
happen sometimes).
I've seen a performance improvement on the order of a factor of ~200 (specifically,
drawing the first frame from the replay from bug 41787 needs only ~10s
instead of ~30min, meaning I can actually compare the output with other
drivers...) with this.
---
 src/gallium/drivers/softpipe/sp_tex_tile_cache.h |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h 
b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
index 0ea82b3..2fd6f12 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -40,7 +40,7 @@ struct softpipe_tex_tile_cache;
 /**
  * Cache tile size (width and height). This needs to be a power of two.
  */
-#define TEX_TILE_SIZE_LOG2 6
+#define TEX_TILE_SIZE_LOG2 5
 #define TEX_TILE_SIZE (1  TEX_TILE_SIZE_LOG2)
 
 
@@ -73,7 +73,7 @@ struct softpipe_tex_cached_tile
} data;
 };
 
-#define NUM_TEX_TILE_ENTRIES 4
+#define NUM_TEX_TILE_ENTRIES 16
 
 struct softpipe_tex_tile_cache
 {
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: fix stencil issues

2013-05-17 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Two (somewhat related) issues:
1) We did mask checks between depth/stencil testing and depth/stencil write.
This meant that if the depth/stencil test killed off all fragments we never
actually wrote the new stencil value. This issue affected all early/late
test/write combinations.
2) We actually did early depth/stencil test and late depth/stencil write even
when the shader could kill the fragment (alpha test or discard). Since it
matters for the new stencil value if the fragment is killed by depth/stencil
test or by the shader (in which case it will not reach the depth/stencil test)
this simply cannot work.
So fix these issues by moving the mask check after depth/stencil write (only
for early write; it would work for late write too, but it's probably not worth
the mask check there) and disable early depth test when it can't work correctly.
This addresses https://bugs.freedesktop.org/show_bug.cgi?id=41787 though
replaying the trace it still looks somewhat wrong to me, so maybe more bugs...
Verified this fixes affected piglit tests (glean stencil2 and some from hiz
group) if the simple_shader optimization in generate_fs_loop() is forced to
false (otherwise we skip mask checks hence don't hit issue 1 - I don't think
there's anything in piglit which would exhibit issue 2).
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |   31 ++-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |   20 -
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 2376ca7..afc2d9d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1096,23 +1096,18 @@ lp_build_depth_stencil_test(struct gallivm_state 
*gallivm,
   stencil_shift, );
 
/* Finally, merge the z/stencil values */
-   if ((depth-enabled  depth-writemask) ||
-   (stencil[0].enabled  (stencil[0].writemask ||
-   (stencil[1].enabled  stencil[1].writemask 
{
-
-  if (format_desc-block.bits = 32) {
- if (have_z  have_s)
-*z_value = LLVMBuildOr(builder, z_dst, stencil_vals, );
- else if (have_z)
-*z_value = z_dst;
- else
-*z_value = stencil_vals;
- *s_value = *z_value;
-  }
-  else {
+   if (format_desc-block.bits = 32) {
+  if (have_z  have_s)
+ *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, );
+  else if (have_z)
  *z_value = z_dst;
- *s_value = stencil_vals;
-  }
+  else
+ *z_value = stencil_vals;
+  *s_value = *z_value;
+   }
+   else {
+  *z_value = z_dst;
+  *s_value = stencil_vals;
}
 
if (s_pass_mask)
@@ -1120,9 +1115,5 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
 
if (depth-enabled  stencil[0].enabled)
   lp_build_mask_update(mask, z_pass);
-
-   if (do_branch)
-  lp_build_mask_check(mask);
-
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 1dfc75a..ae63615 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -266,13 +266,20 @@ generate_fs_loop(struct gallivm_state *gallivm,
   assert(zs_format_desc);
 
   if (!shader-info.base.writes_z) {
- if (key-alpha.enabled || shader-info.base.uses_kill)
+ if (key-alpha.enabled || shader-info.base.uses_kill) {
 /* With alpha test and kill, can do the depth test early
  * and hopefully eliminate some quads.  But need to do a
  * special deferred depth write once the final mask value
- * is known.
+ * is known. This only works though if there's either no
+ * stencil test or the stencil value isn't written.
  */
-depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+if (key-stencil[0].enabled  (key-stencil[0].writemask ||
+(key-stencil[1].enabled 
+ key-stencil[1].writemask)))
+   depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+else
+   depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+ }
  else
 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
   }
@@ -281,9 +288,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
   }
 
   if (!(key-depth.enabled  key-depth.writemask) 
-  !((key-stencil[0].enabled  (key-stencil[0].writemask ||
+  !(key-stencil[0].enabled  (key-stencil[0].writemask ||
 (key-stencil[1].enabled 
- key-stencil[1].writemask)
+ key-stencil[1].writemask
  depth_mode = 

[Mesa-dev] [PATCH 1/3] gallivm: handle z32s8x24 format for sampling

2013-05-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Since we can only sample either depth or stencil but not both only load
the required bits which makes things a bit easier (it requires special
handling since the format doesn't fit into 32bit).
The logic for deciding if depth or stencil should be sampled is a bit odd,
but seems to be what other drivers and statetrackers do: if it's a format with
both depth and stencil (or just with depth) then sample depth, for sampling
stencil a sampler view format with only stencil is required.
Also, while here, fix up stencil sampling for other formats as well, though
this isn't supported by mesa (ARB_stencil_texturing), and while blits would
use it they don't work either since they'd also need stencil export.
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c |   59 ++---
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 54ca61a..eb50840 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -51,16 +51,25 @@ lp_build_format_swizzle_soa(const struct 
util_format_description *format_desc,
assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);
 
if (format_desc-colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+  enum util_format_swizzle swizzle;
+  LLVMValueRef depthorstencil;
+
+  if (util_format_has_stencil(format_desc) 
+  !util_format_has_depth(format_desc)) {
+ assert(!bld-type.floating);
+ swizzle = format_desc-swizzle[1];
+  }
+  else {
+ assert(bld-type.floating);
+ swizzle = format_desc-swizzle[0];
+  }
   /*
-   * Return zzz1 for depth-stencil formats.
-   *
-   * XXX: Allow to control the depth swizzle with an additional parameter,
-   * as the caller may wish another depth swizzle, or retain the stencil
-   * value.
+   * Return zzz1 or sss1 for depth-stencil formats here.
+   * Correct swizzling will be handled by apply_sampler_swizzle() later.
*/
-  enum util_format_swizzle swizzle = format_desc-swizzle[0];
-  LLVMValueRef depth = lp_build_swizzle_soa_channel(bld, unswizzled, 
swizzle);
-  swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth;
+  depthorstencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
+
+  swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depthorstencil;
   swizzled_out[3] = bld-one;
}
else {
@@ -392,6 +401,40 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
   return;
}
 
+   if (format_desc-colorspace == UTIL_FORMAT_COLORSPACE_ZS 
+   format_desc-block.bits == 64) {
+  /*
+   * special case the format is 64 bits but we only require
+   * 32bit (or 8bit) from each block.
+   */
+  LLVMValueRef packed;
+
+  if (format_desc-format == PIPE_FORMAT_X32_S8X24_UINT) {
+ /*
+  * for stencil simply fix up offsets - could in fact change
+  * base_ptr instead even outside the shader.
+  */
+ unsigned mask = (1  8) - 1;
+ LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
+ offset = LLVMBuildAdd(builder, offset, s_offset, );
+ packed = lp_build_gather(gallivm, type.length,
+  32, type.width, base_ptr, offset);
+ packed = LLVMBuildAnd(builder, packed,
+   lp_build_const_int_vec(gallivm, type, mask), 
);
+  }
+  else {
+ assert (format_desc-format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+ packed = lp_build_gather(gallivm, type.length,
+  32, type.width, base_ptr, offset);
+ packed = LLVMBuildBitCast(builder, packed,
+   lp_build_vec_type(gallivm, type), );
+  }
+  /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
+  rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
+  rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
+  return;
+   }
+
/*
 * Try calling lp_build_fetch_rgba_aos for all pixels.
 */
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] llvmpipe: handle z32s8x24 depth/stencil format

2013-05-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We need to split up the depth and stencil values in this case, and there's
some new logic required to handle float depth and stencil simultaneously.
Also make sure we get the 64bit zs clear values and masks propagated
correctly.
---
 src/gallium/auxiliary/gallivm/lp_bld_pack.c   |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |5 +
 src/gallium/drivers/llvmpipe/lp_bld_depth.c   |  269 +
 src/gallium/drivers/llvmpipe/lp_bld_depth.h   |   10 +-
 src/gallium/drivers/llvmpipe/lp_rast.c|   28 ++-
 src/gallium/drivers/llvmpipe/lp_rast.h|6 +-
 src/gallium/drivers/llvmpipe/lp_setup.c   |   18 +-
 src/gallium/drivers/llvmpipe/lp_setup_context.h   |4 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c|   42 ++--
 9 files changed, 242 insertions(+), 143 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c 
b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 9eb9ab0..0a57e39 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -255,7 +255,8 @@ lp_build_concat_n(struct gallivm_state *gallivm,
 /**
  * Interleave vector elements.
  *
- * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
+ * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
+ * (but not for 256bit AVX vectors).
  */
 LLVMValueRef
 lp_build_interleave2(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index beefdae..7be64bf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1540,6 +1540,11 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
  bld.texel_type = lp_type_uint_vec(type.width, type.width * 
type.length);
   }
}
+   else if (util_format_has_stencil(bld.format_desc) 
+   !util_format_has_depth(bld.format_desc)) {
+  /* for stencil only formats, sample stencil (uint) */
+  bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
+   }
 
if (!static_texture_state-level_zero_only) {
   derived_sampler_state.min_mip_filter = 
static_sampler_state-min_mip_filter;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 1cd36b8..f03bfa8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -313,7 +313,7 @@ lp_depth_type(const struct util_format_description 
*format_desc,
   if (format_desc-channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
  type.floating = TRUE;
  assert(z_swizzle == 0);
- assert(format_desc-channel[z_swizzle].size == 
format_desc-block.bits);
+ assert(format_desc-channel[z_swizzle].size == 32);
   }
   else if(format_desc-channel[z_swizzle].type == 
UTIL_FORMAT_TYPE_UNSIGNED) {
  assert(format_desc-block.bits = 32);
@@ -347,15 +347,18 @@ static boolean
 get_z_shift_and_mask(const struct util_format_description *format_desc,
  unsigned *shift, unsigned *width, unsigned *mask)
 {
-   const unsigned total_bits = format_desc-block.bits;
+   unsigned total_bits;
unsigned z_swizzle;
unsigned chan;
unsigned padding_left, padding_right;
-   
+
assert(format_desc-colorspace == UTIL_FORMAT_COLORSPACE_ZS);
assert(format_desc-block.width == 1);
assert(format_desc-block.height == 1);
 
+   /* 64bit d/s format is special already extracted 32 bits */
+   total_bits = format_desc-block.bits  32 ? 32 : format_desc-block.bits;
+
z_swizzle = format_desc-swizzle[0];
 
if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
@@ -402,6 +405,14 @@ get_s_shift_and_mask(const struct util_format_description 
*format_desc,
if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
   return FALSE;
 
+   /* just special case 64bit d/s format */
+   if (format_desc-block.bits  32) {
+  assert(format_desc-format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+  *shift = 0;
+  *mask = 0xff;
+  return TRUE;
+   }
+
*shift = 0;
for (chan = 0; chan  s_swizzle; chan++)
   *shift += format_desc-channel[chan].size;
@@ -517,24 +528,29 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
  * \param loop_counter  the current loop iteration
  * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
  * \param depth_stride  stride of the depth/stencil buffer
+ * \param zs_dst  pointer to results (two values for 64bit ds format)
  */
-LLVMValueRef
+void
 lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
  struct lp_type z_src_type,
  const struct util_format_description 
*format_desc,
  LLVMValueRef depth_ptr,
  LLVMValueRef depth_stride,
+   

[Mesa-dev] [PATCH 3/3] llvmpipe: enable z32s8x24 format

2013-05-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Now that we can handle it both for sampling and as depth/stencil, enable it.
Passes nearly all additional piglit tests which are now performed, with two
exceptions (one being a framebuffer blit which fails for all other formats
including stencil too as we don't support stencil blits, the other reporting
an unexpected GL error, so it doesn't look to be llvmpipe's fault).
---
 src/gallium/drivers/llvmpipe/lp_screen.c |6 --
 1 file changed, 6 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index 667ade1..699ca5f 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -361,12 +361,6 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
  return FALSE;
}
 
-   /* TODO: Support Z32_FLOAT_S8X24_UINT. See lp_bld_depth.c. */
-   if (format_desc-colorspace == UTIL_FORMAT_COLORSPACE_ZS 
-   format_desc-block.bits  32) {
-  return FALSE;
-   }
-
if (bind  PIPE_BIND_DEPTH_STENCIL) {
   if (format_desc-layout != UTIL_FORMAT_LAYOUT_PLAIN)
  return FALSE;
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: get rid of unused tiled/linear logic

2013-05-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We have been rendering to linear color buffers for quite some time, and since
switching to linear depth buffers all the tiled/linear logic has been unused.
So get rid of (most of) it - there are still some LAYOUT_NONE things and
late allocation of resources which probably could be simplified.
---
 src/gallium/drivers/llvmpipe/Makefile.am |3 +-
 src/gallium/drivers/llvmpipe/SConscript  |3 +-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h  |4 +-
 src/gallium/drivers/llvmpipe/lp_texture.c|  388 +++---
 src/gallium/drivers/llvmpipe/lp_texture.h|   10 -
 src/gallium/drivers/llvmpipe/lp_tile_image.c |  294 ---
 src/gallium/drivers/llvmpipe/lp_tile_image.h |   61 
 7 files changed, 50 insertions(+), 713 deletions(-)
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_tile_image.c
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_tile_image.h

diff --git a/src/gallium/drivers/llvmpipe/Makefile.am 
b/src/gallium/drivers/llvmpipe/Makefile.am
index f1ba5d1..9059053 100644
--- a/src/gallium/drivers/llvmpipe/Makefile.am
+++ b/src/gallium/drivers/llvmpipe/Makefile.am
@@ -72,8 +72,7 @@ libllvmpipe_la_SOURCES = \
lp_state_vs.c \
lp_surface.c \
lp_tex_sample.c \
-   lp_texture.c \
-   lp_tile_image.c
+   lp_texture.c
 
 libllvmpipe_la_LDFLAGS = $(LLVM_LDFLAGS)
 
diff --git a/src/gallium/drivers/llvmpipe/SConscript 
b/src/gallium/drivers/llvmpipe/SConscript
index a81cf23..22314c2 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -52,8 +52,7 @@ llvmpipe = env.ConvenienceLibrary(
'lp_state_vs.c',
'lp_surface.c',
'lp_tex_sample.c',
-   'lp_texture.c',
-   'lp_tile_image.c',
+   'lp_texture.c'
])
 
 env.Alias('llvmpipe', llvmpipe)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h 
b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 7d01da1..85febff 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -36,10 +36,12 @@
 #include lp_scene.h
 #include lp_state.h
 #include lp_texture.h
-#include lp_tile_image.h
 #include lp_limits.h
 
 
+#define TILE_VECTOR_HEIGHT 4
+#define TILE_VECTOR_WIDTH 4
+
 /* If we crash in a jitted function, we can examine jit_line and jit_state
  * to get some info.  This is not thread-safe, however.
  */
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 0804619..d10a4ce 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -46,7 +46,6 @@
 #include lp_context.h
 #include lp_flush.h
 #include lp_screen.h
-#include lp_tile_image.h
 #include lp_texture.h
 #include lp_setup.h
 #include lp_state.h
@@ -334,11 +333,6 @@ llvmpipe_resource_destroy(struct pipe_screen *pscreen,
   struct sw_winsys *winsys = screen-winsys;
   winsys-displaytarget_destroy(winsys, lpr-dt);
 
-  if (lpr-tiled_img.data) {
- align_free(lpr-tiled_img.data);
- lpr-tiled_img.data = NULL;
-  }
-
   FREE(lpr-layout[0]);
}
else if (llvmpipe_resource_is_texture(pt)) {
@@ -351,12 +345,6 @@ llvmpipe_resource_destroy(struct pipe_screen *pscreen,
  lpr-linear_img.data = NULL;
   }
 
-  /* free tiled image data */
-  if (lpr-tiled_img.data) {
- align_free(lpr-tiled_img.data);
- lpr-tiled_img.data = NULL;
-  }
-
   /* free layout flag arrays */
   for (level = 0; level  Elements(lpr-layout); level++) {
  FREE(lpr-layout[level]);
@@ -398,7 +386,6 @@ llvmpipe_resource_map(struct pipe_resource *resource,
   tex_usage == LP_TEX_USAGE_WRITE_ALL);
 
assert(layout == LP_TEX_LAYOUT_NONE ||
-  layout == LP_TEX_LAYOUT_TILED ||
   layout == LP_TEX_LAYOUT_LINEAR);
 
if (lpr-dt) {
@@ -850,27 +837,10 @@ static unsigned
 tex_image_face_size(const struct llvmpipe_resource *lpr, unsigned level,
 enum lp_texture_layout layout)
 {
-   const unsigned width = u_minify(lpr-base.width0, level);
-   const unsigned height = u_minify(lpr-base.height0, level);
-
-   assert(layout == LP_TEX_LAYOUT_TILED ||
-  layout == LP_TEX_LAYOUT_LINEAR);
+   assert(layout == LP_TEX_LAYOUT_LINEAR);
 
-   if (layout == LP_TEX_LAYOUT_TILED) {
-  /* for tiled layout, force a 32bpp format */
-  const enum pipe_format format = PIPE_FORMAT_B8G8R8A8_UNORM;
-  const unsigned block_size = util_format_get_blocksize(format);
-  const unsigned nblocksy =
- util_format_get_nblocksy(format, align(height, TILE_SIZE));
-  const unsigned nblocksx =
- util_format_get_nblocksx(format, align(width, TILE_SIZE));
-  const unsigned buffer_size = block_size * nblocksy * nblocksx;
-  return buffer_size;
-   }
-   else {
-  /* we already computed this */
-  return 

[Mesa-dev] [PATCH] llvmpipe: fix bogus handling of first_layer when setting up texture sampling

2013-05-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The code avoided a first_layer parameter in the sampler interface (and needing
to do another calculation at runtime) by fixing up the base texture pointer
instead. Unfortunately, this didn't actually work: since we have a mip-first
texture layout, fixing up the base ptr by a fixed amount is very wrong if
there are mipmaps present. The wrong offsets caused misrendering and crashes.
Fix this by just adjusting the individual mip level offsets instead.
Spotted by Jose.
---
 src/gallium/drivers/llvmpipe/lp_setup.c |   23 ++-
 src/gallium/drivers/llvmpipe/lp_state_sampler.c |6 --
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index b5b00d1..ce9be92 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -748,23 +748,28 @@ lp_setup_set_fragment_sampler_views(struct 
lp_setup_context *setup,
  jit_tex-img_stride[j] = lp_tex-img_stride[j];
   }
 
-  /*
-   * We don't use anything like first_element (for buffers) or
-   * first_layer (for arrays), instead adjust the last_element
-   * (width) or last_layer (depth) plus the base pointer.
-   * Less parameters and faster at shader execution.
-   * XXX Could do the same for mip levels.
-   */
   if (res-target == PIPE_TEXTURE_1D_ARRAY ||
   res-target == PIPE_TEXTURE_2D_ARRAY) {
+ /*
+  * For arrays, we don't have first_layer, instead adjust
+  * last_layer (depth) plus the mip level offsets
+  * (as we have mip-first layout can't just adjust base 
ptr).
+  * XXX For mip levels, could do something similar.
+  */
  jit_tex-depth = view-u.tex.last_layer - 
view-u.tex.first_layer + 1;
- jit_tex-base = (uint8_t *)jit_tex-base +
- view-u.tex.first_layer * 
lp_tex-img_stride[0];
+ for (j = first_level; j = last_level; j++) {
+jit_tex-mip_offsets[j] += view-u.tex.first_layer *
+   lp_tex-img_stride[j];
+ }
  assert(view-u.tex.first_layer = view-u.tex.last_layer);
  assert(view-u.tex.last_layer  res-array_size);
   }
}
else {
+  /*
+   * For buffers, we don't have first_element, instead adjust
+   * last_element (width) plus the base pointer.
+   */
   unsigned view_blocksize = 
util_format_get_blocksize(view-format);
   /* probably don't really need to fill that out */
   jit_tex-mip_offsets[0] = 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 6498d13..6b7e327 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -320,8 +320,10 @@ prepare_shader_sampling(
if (res-target == PIPE_TEXTURE_1D_ARRAY ||
res-target == PIPE_TEXTURE_2D_ARRAY) {
   num_layers = view-u.tex.last_layer - 
view-u.tex.first_layer + 1;
-  addr = (uint8_t *)addr +
-  view-u.tex.first_layer * 
lp_tex-img_stride[0];
+  for (j = first_level; j = last_level; j++) {
+ mip_offsets[j] += view-u.tex.first_layer *
+   lp_tex-img_stride[j];
+  }
   assert(view-u.tex.first_layer = view-u.tex.last_layer);
   assert(view-u.tex.last_layer  res-array_size);
}
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] st/mesa: fix weird UCMP opcode use for bool ubo load

2013-05-08 Thread sroland
From: Roland Scheidegger srol...@vmware.com

I don't know what this code was trying to do, but whatever it was it couldn't
have worked: negation of integer boolean inputs, while not specified as
outright illegal (not yet at least), won't do anything since it doesn't affect
the result of the comparison with zero at all. In fact it looks like the whole
instruction can just be omitted.
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp 
b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index f2eb3e7..08b2d7a 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1945,8 +1945,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
   if (ir-type-base_type == GLSL_TYPE_BOOL) {
  emit(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
- result_src.negate = 1;
- emit(ir, TGSI_OPCODE_UCMP, result_dst, result_src, 
st_src_reg_for_int(~0), st_src_reg_for_int(0));
   } else {
  emit(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
   }
@@ -2388,8 +2386,8 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
  if (native_integers) {
 /* This is necessary because TGSI's CMP instruction expects the
  * condition to be a float, and we store booleans as integers.
- * If TGSI had a UCMP instruction or similar, this extra
- * instruction would not be necessary.
+ * TODO: really want to avoid i2f path and use UCMP. Requires
+ * changes to process_move_condition though too.
  */
 condition_temp = get_temp(glsl_type::vec4_type);
 condition.negate = 0;
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallium/tgsi: clarify (possibly change) TGSI_OPCODE_UCMP definition

2013-05-07 Thread sroland
From: Roland Scheidegger srol...@vmware.com

UCMP, while an integer opcode, isn't really consistently implemented as
having all integer arguments. softpipe will assume all arguments are
ints, whereas gallivm has the arguments defined as untyped, which
means they'll get treated as floats. This means input modifiers will
not work the same. Fix this by saying only the first arg is an integer,
which seems more useful than making all arguments integers - this would
be similar to the d3d10 movc opcode.
---
 src/gallium/docs/source/tgsi.rst |5 +
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 3af1fb7..852f8a0 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -1291,6 +1291,11 @@ Support for these opcodes indicated by 
PIPE_SHADER_CAP_INTEGERS (all of them?)
 
 .. opcode:: UCMP - Integer Conditional Move
 
+.. note::
+
+   Only the first source arg is an integer, the 2nd and 3rd ones are
+   considered floats (for input modifier purposes).
+
 .. math::
 
   dst.x = src0.x ? src1.x : src2.x
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallium: more tgsi documentation updates

2013-05-03 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Adds the remaining integer opcodes, and some opcodes are moved to more
appropriate places, along with getting rid of the (already nearly empty)
ps_2_x section. The CAP bits for some of these are still a bit up in
the air, though, so the documentation isn't quite as watertight as is desirable.
---
 src/gallium/docs/source/tgsi.rst |  381 +-
 1 file changed, 251 insertions(+), 130 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index b2f7a85..b479fcf 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -473,7 +473,7 @@ This instruction replicates its result.
 
 .. opcode:: KILP - Predicated Discard
 
-  discard
+  Not really predicated, just unconditional discard
 
 
 .. opcode:: PK2H - Pack Two 16-bit Floats
@@ -720,25 +720,6 @@ This instruction replicates its result.
   dst.w = round(src.w)
 
 
-.. opcode:: BRA - Branch
-
-  pc = target
-
-.. note::
-
-   Considered for removal.
-
-.. opcode:: CAL - Subroutine Call
-
-  push(pc)
-  pc = target
-
-
-.. opcode:: RET - Subroutine Call Return
-
-  pc = pop()
-
-
 .. opcode:: SSG - Set Sign
 
 .. math::
@@ -856,99 +837,6 @@ This instruction replicates its result.
   dst = texture_sample(unit, coord, lod)
 
 
-.. opcode:: BRK - Break
-
-  Unconditionally moves the point of execution to the instruction after the
-  next endloop or endswitch. The instruction must appear within a loop/endloop
-  or switch/endswitch.
-
-
-.. opcode:: BREAKC - Break Conditional
-
-  Conditionally moves the point of execution to the instruction after the
-  next endloop or endswitch. The instruction must appear within a loop/endloop
-  or switch/endswitch.
-  Condition evaluates to true if src0.x != 0 where src0.x is interpreted
-  as an integer register.
-
-
-.. opcode:: CONT - Continue
-
-  TBD
-
-.. note::
-
-   Support for CONT is determined by a special capability bit,
-   ``TGSI_CONT_SUPPORTED``. See :ref:`Screen` for more information.
-
-
-.. opcode:: IF - Float If
-
-  Start an IF ... ELSE .. ENDIF block.  Condition evaluates to true if
-
-src0.x != 0.0
-
-  where src0.x is interpreted as a floating point register.
-
-
-.. opcode:: UIF - Bitwise If
-
-  Start an UIF ... ELSE .. ENDIF block. Condition evaluates to true if
-
-src0.x != 0
-
-  where src0.x is interpreted as an integer register.
-
-
-.. opcode:: ELSE - Else
-
-  Starts an else block, after an IF or UIF statement.
-
-
-.. opcode:: ENDIF - End If
-
-  Ends an IF or UIF block.
-
-
-.. opcode:: SWITCH - Switch
-
-   Starts a C-style switch expression. The switch consists of one or multiple
-   CASE statements, and at most one DEFAULT statement. Execution of a statement
-   ends when a BRK is hit, but just like in C falling through to other cases
-   without a break is allowed. Similarly, DEFAULT label is allowed anywhere not
-   just as last statement, and fallthrough is allowed into/from it.
-   CASE src arguments are evaluated at bit level against the SWITCH src 
argument.
-
-   Example:
-   SWITCH src[0].x
-   CASE src[0].x
-   (some instructions here)
-   (optional BRK here)
-   DEFAULT
-   (some instructions here)
-   (optional BRK here)
-   CASE src[0].x
-   (some instructions here)
-   (optional BRK here)
-   ENDSWITCH
-
-
-.. opcode:: CASE - Switch case
-
-   This represents a switch case label. The src arg must be an integer 
immediate.
-
-
-.. opcode:: DEFAULT - Switch default
-
-   This represents the default case in the switch, which is taken if no other
-   case matches.
-
-
-.. opcode:: ENDSWITCH - End of switch
-
-   Ends a switch expression.
-
-
 .. opcode:: PUSHA - Push Address Register On Stack
 
   push(src.x)
@@ -980,6 +868,28 @@ This instruction replicates its result.
Considered for removal.
 
 
+.. opcode:: BRA - Branch
+
+  pc = target
+
+.. note::
+
+   Considered for removal.
+
+
+.. opcode:: CALLNZ - Subroutine Call If Not Zero
+
+   TBD
+
+.. note::
+
+   Considered for cleanup.
+
+.. note::
+
+   Considered for removal.
+
+
 Compute ISA
 
 
@@ -1380,8 +1290,6 @@ Support for these opcodes indicated by 
PIPE_SHADER_CAP_INTEGERS (all of them?)
   dst.w = src0.w >> (unsigned) src1.x
 
 
-
-
 .. opcode:: UCMP - Integer Conditional Move
 
 .. math::
@@ -1395,6 +1303,115 @@ Support for these opcodes indicated by 
PIPE_SHADER_CAP_INTEGERS (all of them?)
   dst.w = src0.w ? src1.w : src2.w
 
 
+
+.. opcode:: ISSG - Integer Set Sign
+
+.. math::
+
+  dst.x = (src0.x < 0) ? -1 : (src0.x > 0) ? 1 : 0
+
+  dst.y = (src0.y < 0) ? -1 : (src0.y > 0) ? 1 : 0
+
+  dst.z = (src0.z < 0) ? -1 : (src0.z > 0) ? 1 : 0
+
+  dst.w = (src0.w < 0) ? -1 : (src0.w > 0) ? 1 : 0
+
+
+
+.. opcode:: ISLT - Signed Integer Set On Less Than
+
+.. math::
+
+  dst.x = (src0.x < src1.x) ? ~0 : 0
+
+  dst.y = (src0.y < src1.y) ? ~0 : 0
+
+  dst.z = (src0.z < src1.z) ? ~0 : 0
+
+  dst.w = (src0.w < src1.w) ? ~0 : 0
+
+
+.. opcode:: USLT - Unsigned 

[Mesa-dev] [PATCH] gallium: tgsi documentation updates and clarification for integer opcodes.

2013-05-02 Thread sroland
From: Roland Scheidegger srol...@vmware.com

A lot of them were missing. Others were moved from the Compute ISA
to a new Integer ISA section as that seemed more appropriate.
---
 src/gallium/docs/source/tgsi.rst |  362 ++
 1 file changed, 289 insertions(+), 73 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index a528fd2..b7caf63 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -872,6 +872,16 @@ This instruction replicates its result.
   as an integer register.
 
 
+.. opcode:: CONT - Continue
+
+  TBD
+
+.. note::
+
+   Support for CONT is determined by a special capability bit,
+   ``TGSI_CONT_SUPPORTED``. See :ref:`Screen` for more information.
+
+
 .. opcode:: IF - Float If
 
   Start an IF ... ELSE .. ENDIF block.  Condition evaluates to true if
@@ -977,6 +987,7 @@ These opcodes are primarily provided for special-use 
computational shaders.
 Support for these opcodes indicated by a special pipe capability bit (TBD).
 
 XXX so let's discuss it, yeah?
+XXX doesn't look like most of the opcodes really belong here.
 
 .. opcode:: CEIL - Ceiling
 
@@ -991,7 +1002,89 @@ XXX so let's discuss it, yeah?
   dst.w = \lceil src.w\rceil
 
 
-.. opcode:: I2F - Integer To Float
+.. opcode:: TRUNC - Truncate
+
+.. math::
+
+  dst.x = trunc(src.x)
+
+  dst.y = trunc(src.y)
+
+  dst.z = trunc(src.z)
+
+  dst.w = trunc(src.w)
+
+
+.. opcode:: MOD - Modulus
+
+.. math::
+
+  dst.x = src0.x \bmod src1.x
+
+  dst.y = src0.y \bmod src1.y
+
+  dst.z = src0.z \bmod src1.z
+
+  dst.w = src0.w \bmod src1.w
+
+
+.. opcode:: UARL - Integer Address Register Load
+
+  Moves the contents of the source register, assumed to be an integer, into the
+  destination register, which is assumed to be an address (ADDR) register.
+
+
+.. opcode:: SAD - Sum Of Absolute Differences
+
+.. math::
+
+  dst.x = |src0.x - src1.x| + src2.x
+
+  dst.y = |src0.y - src1.y| + src2.y
+
+  dst.z = |src0.z - src1.z| + src2.z
+
+  dst.w = |src0.w - src1.w| + src2.w
+
+
+.. opcode:: TXF - Texel Fetch (as per NV_gpu_shader4), extract a single texel
+  from a specified texture image. The source sampler may
+ not be a CUBE or SHADOW.
+  src 0 is a four-component signed integer vector used to
+ identify the single texel accessed. 3 components + level.
+ src 1 is a 3 component constant signed integer vector,
+ with each component only have a range of
+ -8..+8 (hw only seems to deal with this range, interface
+ allows for up to unsigned int).
+ TXF(uint_vec coord, int_vec offset).
+
+
+.. opcode:: TXQ - Texture Size Query (as per NV_gpu_program4)
+  retrieve the dimensions of the texture
+  depending on the target. For 1D (width), 2D/RECT/CUBE
+ (width, height), 3D (width, height, depth),
+ 1D array (width, layers), 2D array (width, height, layers)
+
+.. math::
+
+  lod = src0
+
+  dst.x = texture_width(unit, lod)
+
+  dst.y = texture_height(unit, lod)
+
+  dst.z = texture_depth(unit, lod)
+
+
+Integer ISA
+
+These opcodes are used for integer operations.
+Support for these opcodes indicated by PIPE_SHADER_CAP_INTEGERS (all of them?)
+
+
+.. opcode:: I2F - Signed Integer To Float
+
+   Rounding is unspecified (round to nearest even suggested).
 
 .. math::
 
@@ -1004,56 +1097,157 @@ XXX so let's discuss it, yeah?
   dst.w = (float) src.w
 
 
-.. opcode:: NOT - Bitwise Not
+.. opcode:: U2F - Unsigned Integer To Float
+
+   Rounding is unspecified (round to nearest even suggested).
 
 .. math::
 
-  dst.x = ~src.x
+  dst.x = (float) src.x
 
-  dst.y = ~src.y
+  dst.y = (float) src.y
 
-  dst.z = ~src.z
+  dst.z = (float) src.z
 
-  dst.w = ~src.w
+  dst.w = (float) src.w
 
 
-.. opcode:: TRUNC - Truncate
+.. opcode:: F2I - Float to Signed Integer
+
+   Rounding is towards zero (truncate).
+   Values outside signed range (including NaNs) produce undefined results.
 
 .. math::
 
-  dst.x = trunc(src.x)
+  dst.x = (int) src.x
 
-  dst.y = trunc(src.y)
+  dst.y = (int) src.y
 
-  dst.z = trunc(src.z)
+  dst.z = (int) src.z
 
-  dst.w = trunc(src.w)
+  dst.w = (int) src.w
 
 
-.. opcode:: SHL - Shift Left
+.. opcode:: F2U - Float to Unsigned Integer
+
+   Rounding is towards zero (truncate).
+   Values outside unsigned range (including NaNs) produce undefined results.
 
 .. math::
 
-  dst.x = src0.x << src1.x
+  dst.x = (unsigned) src.x
 
-  dst.y = src0.y << src1.x
+  dst.y = (unsigned) src.y
 
-  dst.z = src0.z << src1.x
+  dst.z = (unsigned) src.z
 
-  dst.w = src0.w << src1.x
+  dst.w = (unsigned) src.w
 
 
-.. opcode:: SHR - Shift Right
+.. opcode:: UADD - Integer Add
+
+   This instruction works the same for signed and unsigned integers.
+   The low 32bit of the result is returned.
 
 .. math::
 
-  dst.x = src0.x >> src1.x
+ 

[Mesa-dev] [PATCH] llvmpipe: get rid of depth swizzling.

2013-04-26 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Eliminating this we no longer need to copy between linear and swizzled layout.
This is probably not quite ideal since it's a bit more work for now, could do
some optimizations by moving depth testing outside the fragment shader loop
(but tricky for early depth test as we have neither the mask nor the
interpolated z in the right order handy).
The large amount of tile/untile code is no longer needed and will be deleted
in the next commit.
Still busted though for some reason; in particular everything reading/writing
the depth/stencil buffer directly just fails...
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |  308 +++
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |   36 ++--
 src/gallium/drivers/llvmpipe/lp_jit.h   |4 +-
 src/gallium/drivers/llvmpipe/lp_rast.c  |  167 ++-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |  105 +
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |   56 ++---
 6 files changed, 404 insertions(+), 272 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index b9dbdc5..59556d8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -36,21 +36,13 @@
  * flushing would avoid this, but it would most likely result in depth fighting
  * artifacts.
  *
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing 
- *
- *  Z11 Z12 Z13 Z14 ...
- *  Z21 Z22 Z23 Z24 ...
- *  Z31 Z32 Z33 Z34 ...
- *  Z41 Z42 Z43 Z44 ...
- *  ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- *  ... ... ... ... ... ... ... ... ...
+ * Since we're using linear layout for everything, but we need to deal with
+ * 2x2 quads, we need to load/store multiple values and swizzle them into
+ * place (we could avoid this by doing depth/stencil testing in linear format,
+ * which would be easy for late depth/stencil test as we could do that after
+ * the fragment shader loop just as we do for color buffers, but more tricky
+ * for early depth test as we'd need both masks and interpolated depth in
+ * linear format).
  *
  *
  * @author Jose Fonseca jfons...@vmware.com
@@ -71,6 +63,7 @@
 #include gallivm/lp_bld_intr.h
 #include gallivm/lp_bld_debug.h
 #include gallivm/lp_bld_swizzle.h
+#include gallivm/lp_bld_pack.h
 
 #include lp_bld_depth.h
 
@@ -515,6 +508,210 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
 }
 
 
+/**
+ * Load depth/stencil values.
+ * The stored values are linear, swizzle them.
+ *
+ * \param type  the data type of the fragment depth/stencil values
+ * \param format_desc  description of the depth/stencil surface
+ * \param loop_counter  the current loop iteration
+ * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride  stride of the depth/stencil buffer
+ */
+LLVMValueRef
+lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description 
*format_desc,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef loop_counter)
+{
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+   LLVMValueRef zs_dst, zs_dst1, zs_dst2;
+   LLVMValueRef zs_dst_ptr;
+   LLVMValueRef depth_offset1, depth_offset2;
+   unsigned depth_bits = format_desc-block.bits/8;
+   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+   struct lp_type zs_load_type = zs_type;
+   zs_load_type.length = zs_load_type.length / 2;
+
+   if (z_src_type.length == 4) {
+  unsigned i;
+  LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+  lp_build_const_int32(gallivm, 1), 
);
+  LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+  lp_build_const_int32(gallivm, 2), 
);
+  LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+  depth_stride, );
+  depth_offset1 = LLVMBuildMul(builder, looplsb,
+   lp_build_const_int32(gallivm, depth_bits * 
2), );
+  depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, );
+
+  for (i = 0; i  4; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+  }
+   }
+   else {
+  unsigned i;
+  LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), );
+  assert(z_src_type.length 

[Mesa-dev] [PATCH] llvmpipe: get rid of depth swizzling.

2013-04-26 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Eliminating this we no longer need to copy between linear and swizzled layout.
This is probably not quite ideal since it's a bit more work for now, could do
some optimizations by moving depth testing outside the fragment shader loop
(but tricky for early depth test as we have neither the mask nor the
interpolated z in the right order handy).
The large amount of tile/untile code is no longer needed and will be deleted
in the next commit.
v2: change a forgotten LAYOUT_NONE to LAYOUT_LINEAR.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |  308 +++
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |   36 ++--
 src/gallium/drivers/llvmpipe/lp_jit.h   |4 +-
 src/gallium/drivers/llvmpipe/lp_rast.c  |  167 ++-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |  105 +
 src/gallium/drivers/llvmpipe/lp_scene.c |2 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |   56 ++---
 7 files changed, 405 insertions(+), 273 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index b9dbdc5..59556d8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -36,21 +36,13 @@
  * flushing would avoid this, but it would most likely result in depth fighting
  * artifacts.
  *
- * We are free to use a different pixel layout though. Since our basic
- * processing unit is a quad (2x2 pixel block) we store the depth/stencil
- * values tiled, a quad at time. That is, a depth buffer containing 
- *
- *  Z11 Z12 Z13 Z14 ...
- *  Z21 Z22 Z23 Z24 ...
- *  Z31 Z32 Z33 Z34 ...
- *  Z41 Z42 Z43 Z44 ...
- *  ... ... ... ... ...
- *
- * will actually be stored in memory as
- *
- *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
- *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
- *  ... ... ... ... ... ... ... ... ...
+ * Since we're using linear layout for everything, but we need to deal with
+ * 2x2 quads, we need to load/store multiple values and swizzle them into
+ * place (we could avoid this by doing depth/stencil testing in linear format,
+ * which would be easy for late depth/stencil test as we could do that after
+ * the fragment shader loop just as we do for color buffers, but more tricky
+ * for early depth test as we'd need both masks and interpolated depth in
+ * linear format).
  *
  *
  * @author Jose Fonseca jfons...@vmware.com
@@ -71,6 +63,7 @@
 #include gallivm/lp_bld_intr.h
 #include gallivm/lp_bld_debug.h
 #include gallivm/lp_bld_swizzle.h
+#include gallivm/lp_bld_pack.h
 
 #include lp_bld_depth.h
 
@@ -515,6 +508,210 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
 }
 
 
+/**
+ * Load depth/stencil values.
+ * The stored values are linear, swizzle them.
+ *
+ * \param type  the data type of the fragment depth/stencil values
+ * \param format_desc  description of the depth/stencil surface
+ * \param loop_counter  the current loop iteration
+ * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
+ * \param depth_stride  stride of the depth/stencil buffer
+ */
+LLVMValueRef
+lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
+ struct lp_type z_src_type,
+ const struct util_format_description 
*format_desc,
+ LLVMValueRef depth_ptr,
+ LLVMValueRef depth_stride,
+ LLVMValueRef loop_counter)
+{
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
+   LLVMValueRef zs_dst, zs_dst1, zs_dst2;
+   LLVMValueRef zs_dst_ptr;
+   LLVMValueRef depth_offset1, depth_offset2;
+   unsigned depth_bits = format_desc-block.bits/8;
+   struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
+   struct lp_type zs_load_type = zs_type;
+   zs_load_type.length = zs_load_type.length / 2;
+
+   if (z_src_type.length == 4) {
+  unsigned i;
+  LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
+  lp_build_const_int32(gallivm, 1), 
);
+  LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
+  lp_build_const_int32(gallivm, 2), 
);
+  LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
+  depth_stride, );
+  depth_offset1 = LLVMBuildMul(builder, looplsb,
+   lp_build_const_int32(gallivm, depth_bits * 
2), );
+  depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, );
+
+  for (i = 0; i  4; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+  }
+   }
+   else {
+  unsigned i;
+  LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
+ lp_build_const_int32(gallivm, 1), );
+  assert(z_src_type.length == 8);
+ 

[Mesa-dev] [PATCH 1/5] gallivm: increase nesting limit to 66

2013-04-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This is still not really correct, since at least for sm 4.0
the nesting limit is 64 per subroutine, and subroutine nesting itself
has a limit of 32, so since we have a flat stack we'd need 32*64.
But this should probably be better fixed with per-subroutine stacks,
since otherwise these structures get really big (like 100kB for the
lp_exec_mask).
---
 src/gallium/auxiliary/gallivm/lp_bld_limits.h |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h 
b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index 29bb9e3..5675e36 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -57,9 +57,11 @@
 /**
  * Maximum control flow nesting
  *
- * SM3.0 requires 24
+ * SM4.0 requires 64 (per subroutine actually, subroutine nesting itself is 32)
+ * SM3.0 requires 24 (most likely per subroutine too)
+ * add 2 more (some translation could add one more)
  */
-#define LP_MAX_TGSI_NESTING 32
+#define LP_MAX_TGSI_NESTING 66
 
 /**
  * Maximum iterations before loop termination
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/5] gallium: document breakc and switch/case/default/endswitch

2013-04-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The docs were missing; the opcode-from-hell switch in particular is anything
but obvious.
---
 src/gallium/docs/source/tgsi.rst |   57 ++
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index b7180f8..b46347e 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -861,7 +861,18 @@ This instruction replicates its result.
 
 .. opcode:: BRK - Break
 
-  TBD
+  Unconditionally moves the point of execution to the instruction after the
+  next endloop or endswitch. The instruction must appear within a loop/endloop
+  or switch/endswitch.
+
+
+.. opcode:: BREAKC - Break Conditional
+
+  Conditionally moves the point of execution to the instruction after the
+  next endloop or endswitch. The instruction must appear within a loop/endloop
+  or switch/endswitch.
+  Condition evaluates to true if src0.x != 0 where src0.x is interpreted
+  as an integer register.
 
 
 .. opcode:: IF - Float If
@@ -892,6 +903,45 @@ This instruction replicates its result.
   Ends an IF or UIF block.
 
 
+.. opcode:: SWITCH - Switch
+
+   Starts a c-style switch expression. The switch consists of one or multiple
+   CASE statements, and at most one DEFAULT statement. Execution of a statement
+   ends when a BRK is hit, but just like in C falling through to other cases
+   without a break is allowed. Similarly, DEFAULT label is allowed anywhere not
+   just as last statement, and fallthrough is allowed into/from it.
+   CASE src arguments are evaluated at bit level against the SWITCH src 
argument.
+
+   Example:
+   SWITCH src[0].x
+   CASE src[0].x
+   (some instructions here)
+   (optional BRK here)
+   DEFAULT
+   (some instructions here)
+   (optional BRK here)
+   CASE src[0].x
+   (some instructions here)
+   (optional BRK here)
+   ENDSWITCH
+
+
+.. opcode:: CASE - Switch case
+
+   This represents a switch case label. The src arg must be an integer 
immediate.
+
+
+.. opcode:: DEFAULT - Switch default
+
+   This represents the default case in the switch, which is taken if no other
+   case matches.
+
+
+.. opcode:: ENDSWITCH - End of switch
+
+   Ends a switch expression.
+
+
 .. opcode:: PUSHA - Push Address Register On Stack
 
   push(src.x)
@@ -1210,11 +1260,6 @@ XXX wait what
 
   TBD
 
-
-.. opcode:: BREAKC - Break Conditional
-
-  TBD
-
 .. _doubleopcodes:
 
 Double ISA
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/5] gallivm/tgsi: fix up breakc

2013-04-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

It seems there was a typo in gallivm breakc handling (I am actually still
not sure it is really needed but otherwise that statement really should go
away). Also fix the wrong src argument type, even though they weren't really
used.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |2 +-
 src/gallium/auxiliary/tgsi/tgsi_exec.c  |3 ++-
 src/gallium/auxiliary/tgsi/tgsi_info.c  |3 +++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index af1b8aa..af8abb1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -221,7 +221,7 @@ static void lp_exec_break_condition(struct lp_exec_mask 
*mask,
LLVMValueRef cond_mask = LLVMBuildAnd(builder,
  mask-exec_mask,
  cond, cond_mask);
-   cond_mask = LLVMBuildNot(builder, cond, break_cond);
+   cond_mask = LLVMBuildNot(builder, cond_mask, break_cond);
 
mask-break_mask = LLVMBuildAnd(builder,
mask-break_mask,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 8b46785..75b0663 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -3239,6 +3239,7 @@ exec_case(struct tgsi_exec_machine *mach,
UPDATE_EXEC_MASK(mach);
 }
 
+/* FIXME: this will only work if default is last */
 static void
 exec_default(struct tgsi_exec_machine *mach)
 {
@@ -4200,7 +4201,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_BREAKC:
-  FETCH(r[0], 0, TGSI_CHAN_X);
+  IFETCH(r[0], 0, TGSI_CHAN_X);
   /* update CondMask */
   if (r[0].u[0]  (mach-ExecMask  0x1)) {
  mach-LoopMask = ~0x1;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c 
b/src/gallium/auxiliary/tgsi/tgsi_info.c
index aee2d30..d5db6b9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -298,6 +298,9 @@ tgsi_opcode_infer_src_type( uint opcode )
case TGSI_OPCODE_SAMPLE_I:
case TGSI_OPCODE_SAMPLE_I_MS:
case TGSI_OPCODE_UIF:
+   case TGSI_OPCODE_CASE:
+   case TGSI_OPCODE_SWITCH:
+   case TGSI_OPCODE_BREAKC:
   return TGSI_TYPE_UNSIGNED;
case TGSI_OPCODE_MOD:
case TGSI_OPCODE_I2F:
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/5] gallivm: use uint build context for mask instead of float

2013-04-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Unsurprisingly no one was using it except for grabbing the builder.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index af8abb1..680a9c1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -2851,7 +2851,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
 max_output_vertices);
}
 
-   lp_exec_mask_init(bld.exec_mask, bld.bld_base.base);
+   lp_exec_mask_init(bld.exec_mask, bld.bld_base.int_bld);
 
bld.system_values = *system_values;
 
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/5] gallivm: implement switch opcode

2013-04-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Should be able to handle all things which make this tricky to implement.
Fallthroughs, including most notably into/out of default, should be handled
correctly but are quite a mess.
If we see largely unoptimized switches in the wild, we should probably think
about some real switch optimization pass, e.g. things like this:

switch
case1
someinst
brk
case2
default
case3
someinst
brk
case4
someinst
endswitch

are legal, but the pointless case2/case3 statements not only cause condition
evaluation but will turn this into a fake fallthrough case (because mask for
case2 is already updated when default is encountered) requiring executing code
twice.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h|   17 ++
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c |2 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c|  320 +++-
 3 files changed, 327 insertions(+), 12 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 175b6a9..a3cc76e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -253,6 +253,23 @@ struct lp_exec_mask {
int cond_stack_size;
LLVMValueRef cond_mask;
 
+   boolean break_type_stack[LP_MAX_TGSI_NESTING];
+   boolean break_type;
+
+   struct {
+  LLVMValueRef switch_val;
+  LLVMValueRef switch_mask;
+  LLVMValueRef switch_mask_default;
+  boolean switch_in_default;
+  unsigned switch_pc;
+   } switch_stack[LP_MAX_TGSI_NESTING];
+   int switch_stack_size;
+   LLVMValueRef switch_val;
+   LLVMValueRef switch_mask;
+   LLVMValueRef switch_mask_default;
+   boolean switch_in_default;
+   unsigned switch_pc;
+
LLVMBasicBlockRef loop_block;
LLVMValueRef cont_mask;
LLVMValueRef break_mask;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index e1c362b..dc7c090 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -865,6 +865,8 @@ lp_set_default_actions(struct lp_build_tgsi_context * 
bld_base)
bld_base-op_actions[TGSI_OPCODE_XPD] = xpd_action;
 
bld_base-op_actions[TGSI_OPCODE_BREAKC].fetch_args = 
scalar_unary_fetch_args;
+   bld_base-op_actions[TGSI_OPCODE_SWITCH].fetch_args = 
scalar_unary_fetch_args;
+   bld_base-op_actions[TGSI_OPCODE_CASE].fetch_args = scalar_unary_fetch_args;
bld_base-op_actions[TGSI_OPCODE_COS].fetch_args = scalar_unary_fetch_args;
bld_base-op_actions[TGSI_OPCODE_EX2].fetch_args = scalar_unary_fetch_args;
bld_base-op_actions[TGSI_OPCODE_IF].fetch_args = scalar_unary_fetch_args;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 680a9c1..251b5c5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -78,9 +78,11 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, 
struct lp_build_context
mask-cond_stack_size = 0;
mask-loop_stack_size = 0;
mask-call_stack_size = 0;
+   mask-switch_stack_size = 0;
 
mask-int_vec_type = lp_build_int_vec_type(bld-gallivm, mask-bld-type);
-   mask-exec_mask = mask-ret_mask = mask-break_mask = mask-cont_mask = 
mask-cond_mask =
+   mask-exec_mask = mask-ret_mask = mask-break_mask = mask-cont_mask =
+ mask-cond_mask = mask-switch_mask =
  LLVMConstAllOnes(mask-int_vec_type);
 
mask-loop_limiter = lp_build_alloca(bld-gallivm, int_type, looplimiter);
@@ -110,6 +112,13 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
} else
   mask-exec_mask = mask-cond_mask;
 
+   if (mask-switch_stack_size) {
+  mask-exec_mask = LLVMBuildAnd(builder,
+ mask-exec_mask,
+ mask-switch_mask,
+ switchmask);
+   }
+
if (mask-call_stack_size || mask-ret_in_main) {
   mask-exec_mask = LLVMBuildAnd(builder,
  mask-exec_mask,
@@ -120,6 +129,7 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
mask-has_mask = (mask-cond_stack_size  0 ||
  mask-loop_stack_size  0 ||
  mask-call_stack_size  0 ||
+ mask-switch_stack_size  0 ||
  mask-ret_in_main);
 }
 
@@ -181,6 +191,10 @@ static void lp_exec_bgnloop(struct lp_exec_mask *mask)
 
assert(mask-loop_stack_size  LP_MAX_TGSI_NESTING);
 
+   mask-break_type_stack[mask-loop_stack_size + mask-switch_stack_size] =
+  mask-break_type;
+   mask-break_type = 0;
+
mask-loop_stack[mask-loop_stack_size].loop_block = mask-loop_block;
mask-loop_stack[mask-loop_stack_size].cont_mask = mask-cont_mask;
mask-loop_stack[mask-loop_stack_size].break_mask = mask-break_mask;
@@ -200,16 +214,51 @@ static void 

[Mesa-dev] [PATCH 1/2] gallivm: Add no_rho_opt debug option

2013-04-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This will calculate rho correctly as
sqrt(max((ds/dx)^2 + (dt/dx)^2 + (dr/dx)^2, (ds/dy)^2 + (dt/dy)^2 + (dr/dy)^2))
instead of max(|ds/dx|,|dt/dx|,|dr/dx|,|ds/dy|,|dt/dy|,|dr/dy|)
(for 3 coords - 2 coords work analogous, for 1 coord there's no point doing
the exact version), for both implicit and explicit derivatives.
While such approximation seems to be allowed in OpenGL some APIs may be less
forgiving, and the error can be quite large (sqrt(2) for 2 coords, sqrt(3) for
3 coords so wrong by nearly one mip level in the latter case).
This also helps to single out real bugs from expected ones, so it is debug
only (though at least combined with no_brilinear I didn't really see much of a
performance difference but only tested with a debug build - at least with
implicit mipmaps the instruction count is almost exactly the same though the
instructions are more complex (1 sqrt and mul/adds instead of and/max mostly)).
The code when the option isn't set stays exactly the same.
---
 src/gallium/auxiliary/gallivm/lp_bld_debug.h  |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_init.c   |1 +
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  299 +++--
 3 files changed, 185 insertions(+), 118 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h 
b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index da873f3..b65a1f7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -42,7 +42,8 @@
 #define GALLIVM_DEBUG_NO_OPT(1  3)
 #define GALLIVM_DEBUG_PERF  (1  4)
 #define GALLIVM_DEBUG_NO_BRILINEAR  (1  5)
-#define GALLIVM_DEBUG_GC(1  6)
+#define GALLIVM_DEBUG_NO_RHO_OPT(1  6)
+#define GALLIVM_DEBUG_GC(1  7)
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 4fa5887..6904e62 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -79,6 +79,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = {
{ nopt,   GALLIVM_DEBUG_NO_OPT, NULL },
{ perf,   GALLIVM_DEBUG_PERF, NULL },
{ no_brilinear, GALLIVM_DEBUG_NO_BRILINEAR, NULL },
+   { no_rho_opt, GALLIVM_DEBUG_NO_RHO_OPT, NULL },
{ gc, GALLIVM_DEBUG_GC, NULL },
DEBUG_NAMED_VALUE_END
 };
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index f885363..ae1e003 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -257,31 +257,59 @@ lp_build_rho(struct lp_build_sample_context *bld,
   perquadf_bld-type, rho_vec, 0);
}
else if (derivs  !(bld-static_texture_state-target == 
PIPE_TEXTURE_CUBE)) {
-  LLVMValueRef ddmax[3];
+  LLVMValueRef ddmax[3], ddx[3], ddy[3];
   for (i = 0; i  dims; i++) {
- LLVMValueRef ddx, ddy;
  LLVMValueRef floatdim;
  LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
- ddx = lp_build_abs(coord_bld, derivs-ddx[i]);
- ddy = lp_build_abs(coord_bld, derivs-ddy[i]);
- ddmax[i] = lp_build_max(coord_bld, ddx, ddy);
+
  floatdim = lp_build_extract_broadcast(gallivm, 
bld-float_size_in_type,
coord_bld-type, float_size, 
indexi);
- ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
+
+ if ((gallivm_debug  GALLIVM_DEBUG_NO_RHO_OPT)  (dims  1)) {
+ddx[i] = lp_build_mul(coord_bld, floatdim, derivs-ddx[i]);
+ddy[i] = lp_build_mul(coord_bld, floatdim, derivs-ddy[i]);
+ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
+ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
+ }
+ else {
+LLVMValueRef tmpx, tmpy;
+tmpx = lp_build_abs(coord_bld, derivs-ddx[i]);
+tmpy = lp_build_abs(coord_bld, derivs-ddy[i]);
+ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
+ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
+ }
   }
-  rho_vec = ddmax[0];
-  if (dims  1) {
- rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
+  if ((gallivm_debug  GALLIVM_DEBUG_NO_RHO_OPT)  (dims  1)) {
+ rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
+ rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
  if (dims  2) {
-rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
+rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
+rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
  }
+ rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+ rho = lp_build_pack_aos_scalars(bld-gallivm, coord_bld-type,
+ perquadf_bld-type, rho_vec, 0);
+ /*
+  * note that as long 

[Mesa-dev] [PATCH 2/2] gallivm: change cubemaps / derivatives handling, take 55

2013-04-16 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Turns out the previous fix for handling per-pixel face selection and
derivatives didn't work out that well - the derivatives were wrong by
quite a bit, in theory transformation of the derivatives into cube space
should work, but would be _a lot_ more work than the simplified transform
used.
So, for explicit derivatives, I'm just giving up and going back to not honoring
them.
For implicit derivatives (and the fake explicit ones) however we try
something a little different, we just calculate rho as we would for a 3d
texture, that is after scaling the coords by the inverse major axis.
This gives the same results as calculating the derivs after projection of
the coords to the same face as long as all pixels hit the same face (and
only without no_rho_opt, otherwise it should be a bit worse). And when
not all pixels are hitting the same face, the results aren't so hot but
not catastrophically bad (I believe not off by more than a factor of 2 without
no_rho_opt and not more than sqrt(2) with no_rho_opt). I think this is better
than just picking the wrong face but who knows...
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  212 +++--
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |8 +-
 3 files changed, 119 insertions(+), 104 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index ae1e003..154bcad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -245,16 +245,17 @@ lp_build_rho(struct lp_build_sample_context *bld,
   LLVMValueRef cubesize;
   LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
   /*
-   * If we have derivs too then we have per-pixel cube_rho - doesn't matter
-   * though until we do per-pixel lod.
* Cube map code did already everything except size mul and per-quad 
extraction.
*/
+  rho = lp_build_pack_aos_scalars(bld-gallivm, coord_bld-type,
+  perquadf_bld-type, cube_rho, 0);
+  if (gallivm_debug  GALLIVM_DEBUG_NO_RHO_OPT) {
+ rho = lp_build_sqrt(perquadf_bld, rho);
+  }
   /* Could optimize this for single quad just skip the broadcast */
   cubesize = lp_build_extract_broadcast(gallivm, bld-float_size_in_type,
-coord_bld-type, float_size, 
index0);
-  rho_vec = lp_build_mul(coord_bld, cubesize, cube_rho);
-  rho = lp_build_pack_aos_scalars(bld-gallivm, coord_bld-type,
-  perquadf_bld-type, rho_vec, 0);
+perquadf_bld-type, float_size, 
index0);
+  rho = lp_build_mul(perquadf_bld, cubesize, rho);
}
else if (derivs  !(bld-static_texture_state-target == 
PIPE_TEXTURE_CUBE)) {
   LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -1356,25 +1357,31 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
  LLVMValueRef *face,
  LLVMValueRef *face_s,
  LLVMValueRef *face_t,
- LLVMValueRef *rho)
+ LLVMValueRef *rho,
+ boolean need_derivs)
 {
struct lp_build_context *coord_bld = bld-coord_bld;
LLVMBuilderRef builder = bld-gallivm-builder;
struct gallivm_state *gallivm = bld-gallivm;
LLVMValueRef si, ti, ri;
-   boolean need_derivs = TRUE;
 
if (1 || coord_bld-type.length  4) {
   /*
* Do per-pixel face selection. We cannot however (as we used to do)
* simply calculate the derivs afterwards (which is very bogus for
-   * explicit derivs anyway) because the values would be random when
-   * not all pixels lie on the same face. Hence just transform the derivs
-   * (or rather only the dmax values), which works both for implicit and
-   * explicit derivatives and doesn't add much math (except need to
-   * calculate derivs for 3 instead of 2 coords and have a couple more 
selects
-   * but cuts some minor math elsewhere). The derivs don't need mirroring,
-   * just selection, since noone cares about the sign.
+   * explicit derivs btw) because the values would be random when
+   * not all pixels lie on the same face. So what we do here is just
+   * calculate the derivatives after scaling the coords by the absolute
+   * value of the inverse major axis, and essentially do rho calculation
+   * steps as if it were a 3d texture. This is perfect if all pixels hit
+   * the same face, but not so great at edges, I believe the max error
+   * should be sqrt(2) with no_rho_opt or 2 otherwise (essentially 
measuring
+   * the 3d distance between 2 points on the cube instead of measuring 
up/down
+   * the edge). Still this is possibly a win over just selecting the same 
face
+   

[Mesa-dev] [PATCH] gallivm: fix small but severe bug in handling multiple lod level strides

2013-04-14 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The value for the second quad was inserted in the wrong place for the
following shuffle. This meant the row or image stride was undefined which is
quite catastrophic, can lead to bogus texels fetched or just segfault.
This code is only hit for the SoA path currently; still, it is surprising it
didn't crash more or cause more visible issues (I think llvm used a
broadcast shuffle for the undefined parts of the vector, hence the undefined
value for the second quad was just the same as that from the first quad,
so as long as both quads hit the same mip level everything was fine, and since
lower mips always have the same large stride it made it less likely to
hit out-of-bound memory in case of differing lods).

Note: this is a candidate for release branches.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 7f44c4e..f885363 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -934,7 +934,7 @@ lp_build_get_level_stride_vec(struct 
lp_build_sample_context *bld,
   stride = bld-int_coord_bld.undef;
   for (i = 0; i  bld-num_lods; i++) {
  LLVMValueRef indexi = lp_build_const_int32(bld-gallivm, i);
- LLVMValueRef indexo = lp_build_const_int32(bld-gallivm, i);
+ LLVMValueRef indexo = lp_build_const_int32(bld-gallivm, 4 * i);
  indexes[1] = LLVMBuildExtractElement(builder, level, indexi, );
  stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, );
  stride1 = LLVMBuildLoad(builder, stride1, );
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: some minor cube map cleanup

2013-04-04 Thread sroland
From: Roland Scheidegger srol...@vmware.com

The ar_ge_as_at variable was just very very confusing since the condition
was actually the other way around (as_at_ge_ar). So change the condition
(and the selects depending on it) to match the variable name.
Also, while here, change the chosen major axis in case the coord values
are the same. OpenGL doesn't care one bit which one is chosen in this
case but it looks like dx10 would require z chosen over y, and y chosen
over x (previously did x chosen over y, y chosen over z). Since it's all
the same effort just honor dx10's wishes.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index fe29d25..734cfe0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1403,12 +1403,13 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
   signr = LLVMBuildAnd(builder, ri, signmask, );
 
   /*
-   * major face determination: select x if x = y else select y
-   * select previous result if y = max(x,y) else select z
+   * major face determination: select x if x  y else select y
+   * select z if z = max(x,y) else select previous result
+   * if some axes are the same we choose z over y, y over x.
*/
-  as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, as, at);
+  as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
   maxasat = lp_build_max(coord_bld, as, at);
-  ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxasat, ar);
+  ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
 
   /*
* compute all possible new s/t coords
@@ -1449,13 +1450,13 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
  dmaxtnew = lp_build_select(coord_bld, as_ge_at, dmax[1], dmax[2]);
   }
 
-  *face_s = lp_build_select(cint_bld, ar_ge_as_at, *face_s, snewz);
-  *face_t = lp_build_select(cint_bld, ar_ge_as_at, *face_t, tnewz);
-  ma = lp_build_select(coord_bld, ar_ge_as_at, ma, r);
-  *face = lp_build_select(cint_bld, ar_ge_as_at, *face, facez);
+  *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
+  *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
+  ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+  *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
   if (need_derivs) {
- dmaxsnew = lp_build_select(coord_bld, ar_ge_as_at, dmaxsnew, dmax[0]);
- dmaxtnew = lp_build_select(coord_bld, ar_ge_as_at, dmaxtnew, dmax[1]);
+ dmaxsnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[0], dmaxsnew);
+ dmaxtnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[1], dmaxtnew);
   }
 
   *face_s = LLVMBuildBitCast(builder, *face_s,
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: use f16c hw support for float->half and half->float conversion

2013-04-02 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Should be way faster of course on cpus supporting this (includes AMD
Bulldozer and Jaguar cores, Intel Ivy Bridge and up (except budget models)).
Passes piglit fbo-blending-formats GL_ARB_texture_float -auto on Ivy Bridge.
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |   45 ---
 src/gallium/auxiliary/gallivm/lp_bld_init.c |   10 ++
 src/gallium/auxiliary/util/u_cpu_detect.c   |1 +
 src/gallium/auxiliary/util/u_cpu_detect.h   |1 +
 4 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 38a577c..eb2d096 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -175,9 +175,24 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+   LLVMValueRef h;
+
+   if (util_cpu_caps.has_f16c  HAVE_LLVM = 0x0301 
+   (src_length == 4 || src_length == 8)) {
+  const char *intrinsic = NULL;
+  if (src_length == 4) {
+ src = lp_build_pad_vector(gallivm, src, 8);
+ intrinsic = llvm.x86.vcvtph2ps.128;
+  }
+  else {
+ intrinsic = llvm.x86.vcvtph2ps.256;
+  }
+  return lp_build_intrinsic_unary(builder, intrinsic,
+  lp_build_vec_type(gallivm, f32_type), 
src);
+   }
 
/* Convert int16 vector to int32 vector by zero ext (might generate bad 
code) */
-   LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, );
+   h = LLVMBuildZExt(builder, src, int_vec_type, );
return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
 }
 
@@ -204,9 +219,31 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
LLVMValueRef result;
 
-   result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, 
true);
-   /* Convert int32 vector to int16 vector by trunc (might generate bad code) 
*/
-   result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, 
i16_type), );
+   if (util_cpu_caps.has_f16c  HAVE_LLVM = 0x0301 
+   (length == 4 || length == 8)) {
+  struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
+  unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
+  LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm-context);
+  const char *intrinsic = NULL;
+  if (length == 4) {
+ intrinsic = llvm.x86.vcvtps2ph.128;
+  }
+  else {
+ intrinsic = llvm.x86.vcvtps2ph.256;
+  }
+  result = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, i168_type),
+ src, LLVMConstInt(i32t, mode, 0));
+  if (length == 4) {
+ result = lp_build_extract_range(gallivm, result, 0, 4);
+  }
+   }
+
+   else {
+  result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, 
true);
+  /* Convert int32 vector to int16 vector by trunc (might generate bad 
code) */
+  result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, 
i16_type), );
+   }
 
/*
 * Debugging code.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 050eba7..4fa5887 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -468,6 +468,15 @@ lp_build_init(void)
   util_cpu_caps.has_avx = 0;
}
 
+   if (!HAVE_AVX) {
+  /*
+   * note these instructions are VEX-only, so can only emit if we use
+   * avx (don't want to base it on has_avx  has_f16c later as that would
+   * omit it unnecessarily on amd cpus, see above).
+   */
+  util_cpu_caps.has_f16c = 0;
+   }
+
 #ifdef PIPE_ARCH_PPC_64
/* Set the NJ bit in VSCR to 0 so denormalized values are handled as
 * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This garantees
@@ -495,6 +504,7 @@ lp_build_init(void)
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
util_cpu_caps.has_avx = 0;
+   util_cpu_caps.has_f16c = 0;
 #endif
 }
 
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c 
b/src/gallium/auxiliary/util/u_cpu_detect.c
index 0328051..7e6df9d 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -279,6 +279,7 @@ util_cpu_detect(void)
  util_cpu_caps.has_sse4_1 = (regs2[2]  19)  1;
  util_cpu_caps.has_sse4_2 = (regs2[2]  20)  1;
  util_cpu_caps.has_avx= (regs2[2]  28)  1;
+ util_cpu_caps.has_f16c   = (regs2[2]  29)  1;
  util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus 
supports mmxext too 

[Mesa-dev] [PATCH 1/3] gallivm: minor rho calculation optimization for 1 or 3 coords

2013-04-02 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Using a different packing for the single coord case should save a shuffle.
Plus some minor style fixes.
---
 src/gallium/auxiliary/gallivm/lp_bld_quad.c   |   20 +++-
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   31 +++--
 2 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c 
b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index 1955add..f2a762a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -81,7 +81,8 @@ lp_build_ddy(struct lp_build_context *bld,
 /*
  * Helper for building packed ddx/ddy vector for one coord (scalar per quad
  * values). The vector will look like this (8-wide):
- * dr1dx dr1dy _ _ dr2dx dr2dy _ _
+ * dr1dx _ -dr1dy _ dr2dx _ -dr2dy _
+ * This only requires one shuffle instead of two for more straightforward 
packing.
  */
 LLVMValueRef
 lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
@@ -91,19 +92,15 @@ lp_build_packed_ddx_ddy_onecoord(struct lp_build_context 
*bld,
LLVMBuilderRef builder = gallivm-builder;
LLVMValueRef vec1, vec2;
 
-   /* same packing as _twocoord, but can use aos swizzle helper */
+   /* use aos swizzle helper */
 
-   /*
-* XXX could make swizzle1 a noop swizzle by using right top/bottom
-* pair for ddy
-*/
-   static const unsigned char swizzle1[] = {
-  LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT,
-  LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+   static const unsigned char swizzle1[] = { /* no-op swizzle */
+  LP_BLD_QUAD_TOP_LEFT, LP_BLD_SWIZZLE_DONTCARE,
+  LP_BLD_QUAD_BOTTOM_LEFT, LP_BLD_SWIZZLE_DONTCARE
};
static const unsigned char swizzle2[] = {
-  LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT,
-  LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+  LP_BLD_QUAD_TOP_RIGHT, LP_BLD_SWIZZLE_DONTCARE,
+  LP_BLD_QUAD_TOP_LEFT, LP_BLD_SWIZZLE_DONTCARE
};
 
vec1 = lp_build_swizzle_aos(bld, a, swizzle1);
@@ -120,6 +117,7 @@ lp_build_packed_ddx_ddy_onecoord(struct lp_build_context 
*bld,
  * Helper for building packed ddx/ddy vector for one coord (scalar per quad
  * values). The vector will look like this (8-wide):
  * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * This only needs 2 (v)shufps.
  */
 LLVMValueRef
 lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index fc8bae7..9a00897 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -226,7 +226,6 @@ lp_build_rho(struct lp_build_sample_context *bld,
LLVMValueRef int_size, float_size;
LLVMValueRef rho;
LLVMValueRef first_level, first_level_vec;
-   LLVMValueRef abs_ddx_ddy[2];
unsigned length = coord_bld-type.length;
unsigned num_quads = length / 4;
unsigned i;
@@ -279,32 +278,28 @@ lp_build_rho(struct lp_build_sample_context *bld,
  ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
   }
   else if (dims = 2) {
- ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld,
-   s, t);
+ ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  if (dims  2) {
 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
  }
   }
 
-  abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+  ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
   if (dims  2) {
- abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
-  }
-  else {
- abs_ddx_ddy[1] = NULL;
+ ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
   }
 
-  if (dims == 1) {
- static const unsigned char swizzle1[] = {
+  if (dims  2) {
+ static const unsigned char swizzle1[] = { /* no-op swizzle */
 0, LP_BLD_SWIZZLE_DONTCARE,
 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  };
  static const unsigned char swizzle2[] = {
-1, LP_BLD_SWIZZLE_DONTCARE,
+2, LP_BLD_SWIZZLE_DONTCARE,
 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  };
- rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
- rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+ rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
   }
   else if (dims == 2) {
  static const unsigned char swizzle1[] = {
@@ -315,8 +310,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
 1, 3,
 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  };
- rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], 

[Mesa-dev] [PATCH 2/3] gallivm: do per-pixel cube face selection (finally!!!)

2013-04-02 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This proved to be tricky, the problem is that after selection/mirroring
we cannot calculate reasonable derivatives (if not all pixels in a quad
end up on the same face the derivatives could get randomly exceedingly
large).
However, it is actually quite easy to simply calculate the derivatives
before selection/mirroring and then transform them similar to
the cube coordinates (they only need selection/projection, but not
mirroring as we're not interested in the sign bit, of course). While
there is a tiny bit more work to do (need to calculate derivs for 3
coords instead of 2, and additional selects) it also simplifies things
somewhat for the coord selection itself (as we save some broadcast aos
shuffles, and we don't need to calculate the average vector) - hence if
derivatives aren't needed this should actually be faster.
Also, this has the benefit that this will (trivially) work for explicit
derivatives too, which we completely ignored before that (will be in a
separate commit for better trackability).
Note that while the way for getting rho looks very different, it should
result in nearly the same values as before (the nearly is only because
before the code would choose the face based on an average vector and hence
the derivatives calculated according to this face, where now (for implicit
derivatives) the derivatives are projected on the face selected for the
first (top-left) pixel in a quad, so not necessarily the same face).
The transformation done might not quite be state-of-the-art, and calculating
length(dx,dy) as max(dx,dy) certainly isn't either, but this stays the
same as before (that is, I think a better transform would _somehow_ take
the derivative major axis into account so that derivative changes in
the major axis wouldn't get ignored).
Should solve some accuracy problems with cubemaps (can easily be seen with
the cubemap demo when switching wrapping/filtering), though we still don't
do seamless filtering to fix it completely (so it is not per-sample, but
per-pixel is certainly better than per-quad and already sufficient for
accurate results with nearest tex filter).

As for performance, it seems to be a tiny bit faster too (maybe 3% or so
with cubemap demo). Which I'd have expected with nearest/nearest filtering
where this will be fewer instructions, but the difference seems to actually
be larger with linear/linear_mipmap_linear where it is slightly more
instructions, probably the code appears less serialized allowing better
scheduling (on a sandy bridge cpu). It actually seems to be now at least
as fast as the old path using a conditional when using 128bit vectors too
(that is probably more a result of testing with a newer cpu though), for now
that old path is still there but unused.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  249 ++---
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |4 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |9 +-
 3 files changed, 180 insertions(+), 82 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 9a00897..5d50921 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -207,6 +207,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
  LLVMValueRef s,
  LLVMValueRef t,
  LLVMValueRef r,
+ LLVMValueRef cube_rho,
  const struct lp_derivatives *derivs)
 {
struct gallivm_state *gallivm = bld-gallivm;
@@ -240,8 +241,22 @@ lp_build_rho(struct lp_build_sample_context *bld,
int_size = lp_build_minify(int_size_bld, bld-int_size, first_level_vec);
float_size = lp_build_int_to_float(float_size_bld, int_size);
 
-   /* XXX ignoring explicit derivs for cube maps for now */
-   if (derivs  !(bld-static_texture_state-target == PIPE_TEXTURE_CUBE)) {
+   if (cube_rho) {
+  LLVMValueRef cubesize;
+  LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+  /*
+   * If we have derivs too then we have per-pixel cube_rho - doesn't matter
+   * though until we do per-pixel lod.
+   * Cube map code did already everything except size mul and per-quad 
extraction.
+   */
+  /* Could optimize this for single quad just skip the broadcast */
+  cubesize = lp_build_extract_broadcast(gallivm, bld-float_size_in_type,
+coord_bld-type, float_size, 
index0);
+  rho_vec = lp_build_mul(coord_bld, cubesize, cube_rho);
+  rho = lp_build_pack_aos_scalars(bld-gallivm, coord_bld-type,
+  perquadf_bld-type, rho_vec, 0);
+   }
+   else if (derivs  !(bld-static_texture_state-target == 
PIPE_TEXTURE_CUBE)) {
   LLVMValueRef ddmax[3];
   for (i = 0; i  dims; i++) {
  LLVMValueRef ddx, ddy;
@@ -561,6 +576,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,

[Mesa-dev] [PATCH 3/3] gallivm: honor explicit derivatives values for cube maps.

2013-04-02 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This is trivial now, though we need to make sure we pass all the necessary
derivative values (which is 3 each for ddx/ddy, not 2).
Untested (no piglit test); however, since the transform works the same
as for implicit derivatives, this should probably work correctly.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   10 ++--
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |1 +
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |2 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c   |   66 ++---
 4 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 5d50921..cc04a70 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1287,6 +1287,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
  LLVMValueRef s,
  LLVMValueRef t,
  LLVMValueRef r,
+ const struct lp_derivatives *derivs, /* optional */
  LLVMValueRef *face,
  LLVMValueRef *face_s,
  LLVMValueRef *face_t,
@@ -1296,7 +1297,6 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMBuilderRef builder = bld-gallivm-builder;
struct gallivm_state *gallivm = bld-gallivm;
LLVMValueRef si, ti, ri;
-   boolean implicit_derivs = TRUE;
boolean need_derivs = TRUE;
 
if (1 || coord_bld-type.length  4) {
@@ -1334,9 +1334,9 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
   assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
 
   /*
-   * TODO do this only when needed, and implement explicit derivs 
(trivial).
+   * TODO do this only when needed.
*/
-  if (need_derivs  implicit_derivs) {
+  if (need_derivs  !derivs) {
  LLVMValueRef ddx_ddy[2], tmp[2];
  /*
   * This isn't quite the same as the ordinary path since
@@ -1374,9 +1374,9 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
  dmax[2] = lp_build_max(coord_bld, tmp[0], tmp[1]);
   }
   else if (need_derivs) {
- /* dmax[0] = lp_build_max(coord_bld, derivs-ddx[0], derivs-ddy[0]);
+ dmax[0] = lp_build_max(coord_bld, derivs-ddx[0], derivs-ddy[0]);
  dmax[1] = lp_build_max(coord_bld, derivs-ddx[1], derivs-ddy[1]);
- dmax[2] = lp_build_max(coord_bld, derivs-ddx[2], derivs-ddy[2]); */
+ dmax[2] = lp_build_max(coord_bld, derivs-ddx[2], derivs-ddy[2]);
   }
 
   si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), 
);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 5026b0a..72af813 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -433,6 +433,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
  LLVMValueRef s,
  LLVMValueRef t,
  LLVMValueRef r,
+ const struct lp_derivatives *derivs, /* optional */
  LLVMValueRef *face,
  LLVMValueRef *face_s,
  LLVMValueRef *face_t,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 3b950ea..d2cc0f3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1102,7 +1102,7 @@ lp_build_sample_common(struct lp_build_sample_context 
*bld,
 */
if (target == PIPE_TEXTURE_CUBE) {
   LLVMValueRef face, face_s, face_t;
-  lp_build_cube_lookup(bld, *s, *t, *r, face, face_s, face_t, 
cube_rho);
+  lp_build_cube_lookup(bld, *s, *t, *r, derivs, face, face_s, face_t, 
cube_rho);
   *s = face_s; /* vec */
   *t = face_t; /* vec */
   /* use 'r' to indicate cube face */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index facfc82..007e3c9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1276,8 +1276,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
LLVMValueRef offsets[3] = { NULL };
struct lp_derivatives derivs;
struct lp_derivatives *deriv_ptr = NULL;
-   unsigned num_coords;
-   unsigned dims;
+   unsigned num_coords, num_derivs, num_offsets;
unsigned i;
 
if (!bld-sampler) {
@@ -1291,37 +1290,52 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
switch (inst-Texture.Texture) {
case TGSI_TEXTURE_1D:
   num_coords = 1;
-  dims = 1;
+  num_offsets = 1;
+  num_derivs = 1;
   break;
case TGSI_TEXTURE_1D_ARRAY:
   num_coords = 2;
-  dims = 1;
+  num_offsets = 1;
+  num_derivs = 1;
   

[Mesa-dev] [PATCH 1/2] gallivm: consolidate code for float-to-half and float-to-packed conversion.

2013-03-29 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This replaces the existing float-to-half implementation.
There are definitely a couple of differences - the old implementation
had unspecified(?) rounding behavior, and could at least in theory
construct Inf values out of NaNs. NaNs and Infs should now always be
properly propagated, and rounding behavior is now towards zero
(note this means too large but non-Infinity values get propagated to max
representable value, not Infinity).
The implementation will definitely not match util code, however (which
does nearest rounding, which also means too large values will get
propagated to Infinity).

Also fix a bogus round mask probably leading to rounding bugs...
v2: fix a logic bug in handling infs/nans.
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c|   66 ++
 src/gallium/auxiliary/gallivm/lp_bld_format.h  |9 ++
 .../auxiliary/gallivm/lp_bld_format_float.c|  135 
 3 files changed, 102 insertions(+), 108 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 43c59f3..38a577c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -176,7 +176,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
 
-   /* Convert int16 vector to int32 vector by zero ext */
+   /* Convert int16 vector to int32 vector by zero ext (might generate bad 
code) */
LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, );
return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
 }
@@ -184,16 +184,13 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
 
 /**
  * Converts float32 to int16 half-float
- * Note this can be performed in 1 instruction if vcvtps2ph exists (sse5 i 
think?)
+ * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
  * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
  *
  * @param src   value to convert
  *
- * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
- * ref https://gist.github.com/2156668
- *
- * XXX: This is an approximation. It is faster but certain NaNs are converted 
to
- * infinity, and rounding is not correct.
+ * Convert float32 to half floats, preserving Infs and NaNs,
+ * with rounding towards zero (trunc).
  */
 LLVMValueRef
 lp_build_float_to_half(struct gallivm_state *gallivm,
@@ -203,60 +200,13 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
? LLVMGetVectorSize(f32_vec_type) : 1;
-   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
-   struct lp_type u32_type = lp_type_uint_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
-   LLVMTypeRef u32_vec_type = lp_build_vec_type(gallivm, u32_type);
-   LLVMTypeRef i16_vec_type = lp_build_vec_type(gallivm, i16_type);
-   struct lp_build_context f32_bld;
-   struct lp_build_context u32_bld;
LLVMValueRef result;
 
-   lp_build_context_init(f32_bld, gallivm, f32_type);
-   lp_build_context_init(u32_bld, gallivm, u32_type);
-
-   {
-  /* Constants */
-  LLVMValueRef u32_f32inf= lp_build_const_int_vec(gallivm, u32_type, 
0xff  23);
-  LLVMValueRef u32_expinf= lp_build_const_int_vec(gallivm, u32_type, 
0xe0  23);
-  LLVMValueRef f32_f16max= lp_build_const_vec(gallivm, f32_type, 
65536.0); // 0x8f  23
-  LLVMValueRef f32_magic = lp_build_const_vec(gallivm, f32_type, 
1.92592994e-34); // 0x0f  23
-
-  /* Cast from float32 to int32 */
-  LLVMValueRef f = LLVMBuildBitCast(builder, src, 
u32_vec_type, );
-
-  /* Remove sign */
-  LLVMValueRef srcabs = lp_build_abs(f32_bld, src);
-  LLVMValueRef fabs   = LLVMBuildBitCast(builder, srcabs, 
u32_vec_type, );
-
-  /* Magic conversion */
-  LLVMValueRef clamped   = lp_build_min(f32_bld, f32_f16max, srcabs);
-  LLVMValueRef scaled= LLVMBuildBitCast(builder,
-LLVMBuildFMul(builder,
-  clamped,
-  f32_magic,
-  ),
-u32_vec_type,
-);
-  /* Make sure Inf/NaN and unormalised survive */
-  LLVMValueRef infnancase= LLVMBuildXor(builder, u32_expinf, fabs, );
-  LLVMValueRef b_notnormal   = lp_build_compare(gallivm, f32_type, 
PIPE_FUNC_GEQUAL,
-   

[Mesa-dev] [PATCH 2/2] gallivm: bring back optimized but incorrect float to smallfloat optimizations

2013-03-29 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Conceptually the same as previously done in float_to_half.
Should cut down number of instructions from 14 to 10 or so, but
will promote some NaNs to Infs, so it's disabled.
It gets a bit tricky though handling all the cases correctly...
Passes basic tests either way (though there are no tests testing special
cases, but some manual tests injecting them seemed promising).
---
 .../auxiliary/gallivm/lp_bld_format_float.c|  124 ++--
 1 file changed, 86 insertions(+), 38 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
index 161e392..61b6a60 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_float.c
@@ -79,13 +79,15 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm,
 {
LLVMBuilderRef builder = gallivm-builder;
LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
-   LLVMValueRef rescale_src, tmp, i32_roundmask, small_max;
-   LLVMValueRef is_nan, i32_qnanbit, src_abs, shift, infcheck_src, res;
-   LLVMValueRef is_inf, is_nan_or_inf, nan_or_inf, mask;
+   LLVMValueRef rescale_src, i32_roundmask, small_max;
+   LLVMValueRef i32_qnanbit, shift, res;
+   LLVMValueRef is_nan_or_inf, nan_or_inf, mask, srci;
struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
struct lp_build_context f32_bld, i32_bld;
LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
unsigned exponent_start = mantissa_start + mantissa_bits;
+   boolean always_preserve_nans = true;
+   boolean maybe_correct_denorm_rounding = true;
 
lp_build_context_init(f32_bld, gallivm, f32_type);
lp_build_context_init(i32_bld, gallivm, i32_type);
@@ -94,35 +96,41 @@ lp_build_float_to_smallfloat(struct gallivm_state *gallivm,
  ((1  exponent_bits) - 1)  23);
i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff  23);
 
-   src_abs = lp_build_abs(f32_bld, src);
-   src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, );
+   srci = LLVMBuildBitCast(builder, src, i32_bld.vec_type, );
 
if (has_sign) {
-  rescale_src = src_abs;
-  infcheck_src = src_abs;
-  src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, );
+  rescale_src = src;
}
else {
   /* clamp to pos range (can still have sign bit if NaN or negative zero) 
*/
-  rescale_src = lp_build_max(f32_bld, src, zero);
-  rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, 
);
-  src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, );
-  infcheck_src = src;
+  rescale_src = lp_build_max(f32_bld, zero, src);
}
+   rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, );
 
/* ordinary number */
-   /* get rid of excess mantissa bits, and while here also potential sign bit 
*/
-   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
-  ~((1  (23 - mantissa_bits)) - 1) 
-  0x7fff);
+   /*
+* get rid of excess mantissa bits and sign bit
+* This is only really needed for correct rounding of denorms I think
+* but only if we use the preserve NaN path does using
+* src_abs instead save us any instruction.
+*/
+   if (maybe_correct_denorm_rounding || !always_preserve_nans) {
+  i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
+ ~((1  (23 - mantissa_bits)) - 
1) 
+ 0x7fff);
+  rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, 
);
+  rescale_src = lp_build_and(i32_bld, rescale_src, i32_roundmask);
+  rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, 
);
+   }
+   else {
+  rescale_src = lp_build_abs(f32_bld, src);
+   }
 
-   tmp = lp_build_and(i32_bld, rescale_src, i32_roundmask);
-   tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, );
/* bias exponent (and denormalize if necessary) */
magic = lp_build_const_int_vec(gallivm, i32_type,
   ((1  (exponent_bits - 1)) - 1)  23);
magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, );
-   normal = lp_build_mul(f32_bld, tmp, magic);
+   normal = lp_build_mul(f32_bld, rescale_src, magic);
 
/* clamp to max value - largest non-infinity number */
small_max = lp_build_const_int_vec(gallivm, i32_type,
@@ -141,19 +149,66 @@ lp_build_float_to_smallfloat(struct gallivm_state 
*gallivm,
 * (Cannot actually save the comparison since we need to distinguish
 * Inf and NaN cases anyway, but it would be better for AVX.)
 */
-   is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
- src_abs, i32_floatexpmask);
-   is_inf = lp_build_compare(gallivm, 

[Mesa-dev] [PATCH] gallivm: consolidate some half-to-float and r11g11b10-to-float code

2013-03-24 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Similar enough that we can try to use shared code.
As far as I can tell this should also fix an issue with negative
values for half-to-float conversion (not noticed in tests).
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c|   46 +++-
 src/gallium/auxiliary/gallivm/lp_bld_format.h  |9 +++
 .../auxiliary/gallivm/lp_bld_format_float.c|   58 
 3 files changed, 51 insertions(+), 62 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649d..43c59f3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -75,6 +75,7 @@
 #include lp_bld_logic.h
 #include lp_bld_intr.h
 #include lp_bld_printf.h
+#include lp_bld_format.h
 
 
 
@@ -156,61 +157,28 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 /**
  * Converts int16 half-float to float32
- * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i 
think?)
+ * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
  * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
  *
  * @param src   value to convert
  *
- * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
- * ref https://gist.github.com/2144712
  */
 LLVMValueRef
 lp_build_half_to_float(struct gallivm_state *gallivm,
LLVMValueRef src)
 {
-   int src_length = LLVMGetVectorSize(LLVMTypeOf(src));
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+LLVMGetVectorSize(src_type) : 1;
 
struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
-
-   LLVMBuilderRef builder = gallivm-builder;
LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
-   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
-
-   /* Constants */
-   LLVMValueRef i32_13  = lp_build_const_int_vec(gallivm, i32_type, 
13);
-   LLVMValueRef i32_16  = lp_build_const_int_vec(gallivm, i32_type, 
16);
-   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 
0x7fff);
-   LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 
0x7bff);
-   LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 
0xff  23);
-   LLVMValueRef f32_magic   = LLVMBuildBitCast(builder,
-   
lp_build_const_int_vec(gallivm, i32_type, (254 - 15)  23),
-   float_vec_type, );
 
/* Convert int16 vector to int32 vector by zero ext */
LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, );
-
-   /* Exponent / mantissa bits */
-   LLVMValueRef expmant   = LLVMBuildAnd(builder, i32_mask_nosign, h, );
-   LLVMValueRef shifted   = LLVMBuildBitCast(builder, 
LLVMBuildShl(builder, expmant, i32_13, ), float_vec_type, );
-
-   /* Exponent adjust */
-   LLVMValueRef scaled= LLVMBuildBitCast(builder, 
LLVMBuildFMul(builder, shifted, f32_magic, ), int_vec_type, );
-
-   /* Make sure Inf/NaN survive */
-   LLVMValueRef b_wasinfnan   = lp_build_compare(gallivm, i32_type, 
PIPE_FUNC_GREATER, expmant, i32_was_infnan);
-   LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, 
i32_exp_infnan, );
-
-   /* Sign bit */
-   LLVMValueRef justsign  = LLVMBuildXor(builder, h, expmant, );
-   LLVMValueRef sign  = LLVMBuildShl(builder, justsign, i32_16, );
-
-   /* Combine result */
-   LLVMValueRef sign_inf  = LLVMBuildOr(builder, sign, infnanexp, );
-   LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, );
-
-   /* Cast from int32 vector to float32 vector */
-   return LLVMBuildBitCast(builder, final, float_vec_type, );
+   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h 
b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index f9ddc68..aa8c729 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -127,6 +127,15 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state 
*gallivm,
  */
 
 LLVMValueRef
+lp_build_smallfloat_to_float(struct gallivm_state *gallivm,
+ struct lp_type f32_type,
+ LLVMValueRef src,
+ unsigned mantissa_bits,
+ unsigned exponent_bits,
+ unsigned mantissa_start,
+ boolean has_sign);
+
+LLVMValueRef
 lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
 LLVMValueRef *src);
 
diff --git 

[Mesa-dev] [PATCH] gallivm: consolidate code for float-to-half and float-to-packed conversion.

2013-03-24 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This replaces the existing float-to-half implementation.
There are definitely a couple of differences - the old implementation
had unspecified(?) rounding behavior, and could at least in theory
construct Inf values out of NaNs. NaNs and Infs should now always be
properly propagated, and rounding behavior is now towards zero
(note this means too large but non-Infinity values get propagated to max
representable value, not Infinity).
The implementation will definitely not match util code, however (which
does nearest rounding, which also means too large values will get
propagated to Infinity).

Also fix a bogus round mask probably leading to rounding bugs...
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c|   66 ++
 src/gallium/auxiliary/gallivm/lp_bld_format.h  |9 ++
 .../auxiliary/gallivm/lp_bld_format_float.c|  135 
 3 files changed, 102 insertions(+), 108 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 43c59f3..38a577c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -176,7 +176,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
 
-   /* Convert int16 vector to int32 vector by zero ext */
+   /* Convert int16 vector to int32 vector by zero ext (might generate bad 
code) */
LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, );
return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
 }
@@ -184,16 +184,13 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
 
 /**
  * Converts float32 to int16 half-float
- * Note this can be performed in 1 instruction if vcvtps2ph exists (sse5 i 
think?)
+ * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
  * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
  *
  * @param src   value to convert
  *
- * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
- * ref https://gist.github.com/2156668
- *
- * XXX: This is an approximation. It is faster but certain NaNs are converted 
to
- * infinity, and rounding is not correct.
+ * Convert float32 to half floats, preserving Infs and NaNs,
+ * with rounding towards zero (trunc).
  */
 LLVMValueRef
 lp_build_float_to_half(struct gallivm_state *gallivm,
@@ -203,60 +200,13 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
? LLVMGetVectorSize(f32_vec_type) : 1;
-   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
-   struct lp_type u32_type = lp_type_uint_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
-   LLVMTypeRef u32_vec_type = lp_build_vec_type(gallivm, u32_type);
-   LLVMTypeRef i16_vec_type = lp_build_vec_type(gallivm, i16_type);
-   struct lp_build_context f32_bld;
-   struct lp_build_context u32_bld;
LLVMValueRef result;
 
-   lp_build_context_init(f32_bld, gallivm, f32_type);
-   lp_build_context_init(u32_bld, gallivm, u32_type);
-
-   {
-  /* Constants */
-  LLVMValueRef u32_f32inf= lp_build_const_int_vec(gallivm, u32_type, 
0xff  23);
-  LLVMValueRef u32_expinf= lp_build_const_int_vec(gallivm, u32_type, 
0xe0  23);
-  LLVMValueRef f32_f16max= lp_build_const_vec(gallivm, f32_type, 
65536.0); // 0x8f  23
-  LLVMValueRef f32_magic = lp_build_const_vec(gallivm, f32_type, 
1.92592994e-34); // 0x0f  23
-
-  /* Cast from float32 to int32 */
-  LLVMValueRef f = LLVMBuildBitCast(builder, src, 
u32_vec_type, );
-
-  /* Remove sign */
-  LLVMValueRef srcabs = lp_build_abs(f32_bld, src);
-  LLVMValueRef fabs   = LLVMBuildBitCast(builder, srcabs, 
u32_vec_type, );
-
-  /* Magic conversion */
-  LLVMValueRef clamped   = lp_build_min(f32_bld, f32_f16max, srcabs);
-  LLVMValueRef scaled= LLVMBuildBitCast(builder,
-LLVMBuildFMul(builder,
-  clamped,
-  f32_magic,
-  ),
-u32_vec_type,
-);
-  /* Make sure Inf/NaN and unormalised survive */
-  LLVMValueRef infnancase= LLVMBuildXor(builder, u32_expinf, fabs, );
-  LLVMValueRef b_notnormal   = lp_build_compare(gallivm, f32_type, 
PIPE_FUNC_GEQUAL,
-srcabs,
- 

[Mesa-dev] [PATCH] gallivm: move code for dealing with rgb9e5 and r11g11b10 formats to own file

2013-03-23 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This is really not generic conversion stuff, and the code is very particular to
these formats.
---
 src/gallium/auxiliary/Makefile.sources |1 +
 src/gallium/auxiliary/gallivm/lp_bld_conv.c|  329 -
 src/gallium/auxiliary/gallivm/lp_bld_conv.h|   14 -
 src/gallium/auxiliary/gallivm/lp_bld_format.h  |   18 +
 .../auxiliary/gallivm/lp_bld_format_float.c|  373 
 5 files changed, 392 insertions(+), 343 deletions(-)
 create mode 100644 src/gallium/auxiliary/gallivm/lp_bld_format_float.c

diff --git a/src/gallium/auxiliary/Makefile.sources 
b/src/gallium/auxiliary/Makefile.sources
index 74c7902..898abe0 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -164,6 +164,7 @@ GALLIVM_SOURCES := \
 gallivm/lp_bld_flow.c \
 gallivm/lp_bld_format_aos.c \
 gallivm/lp_bld_format_aos_array.c \
+   gallivm/lp_bld_format_float.c \
 gallivm/lp_bld_format_soa.c \
 gallivm/lp_bld_format_yuv.c \
 gallivm/lp_bld_gather.c \
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 2f39abc..dc3649d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -155,335 +155,6 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 
 /**
- * Convert float32 to a float-like value with less exponent and mantissa
- * bits. The mantissa is still biased, and the mantissa still has an implied 1,
- * but there's no sign bit.
- *
- * @param src (vector) float value to convert
- * @param mantissa_bits   the number of mantissa bits
- * @param exponent_bits   the number of exponent bits
- *
- * Unlike float_to_half using accurate method here.
- * This implements round-towards-zero (trunc) hence too large numbers get
- * converted to largest representable number, not infinity.
- * Small numbers may get converted to denorms, depending on normal
- * float denorm handling of the cpu.
- * Note that compared to the references, below, we skip any rounding bias
- * since we do rounding towards zero - OpenGL allows rounding towards zero
- * (though not preferred) and DX10 even seems to require it.
- * Note that this will not do any packing - the value will
- * look like a rescaled float (except for Inf/NaN) but be returned
- * as int32.
- *
- * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
- * ref https://gist.github.com/rygorous/2156668
- */
-static LLVMValueRef
-lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
-struct lp_type i32_type,
-LLVMValueRef src,
-unsigned mantissa_bits,
-unsigned exponent_bits)
-{
-   LLVMBuilderRef builder = gallivm-builder;
-   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
-   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
-   LLVMValueRef is_nan, is_posinf, is_nan_or_posinf, i32_qnanbit, 
nan_or_posinf;
-   struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
-   struct lp_build_context f32_bld, i32_bld;
-   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
-
-   lp_build_context_init(&f32_bld, gallivm, f32_type);
-   lp_build_context_init(&i32_bld, gallivm, i32_type);
-
-   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
- ((1 << exponent_bits) - 1) << 23);
-   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
-
-   /* ordinary number */
-   /* clamp to pos range (can still have sign bit if NaN or negative zero) */
-   clamped = lp_build_max(f32_bld, src, zero);
-   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, );
-   /* get rid of excess mantissa bits, and while here also potential sign bit 
*/
-   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
-  ~((1  (23 - mantissa_bits)) - 1) |
-  0x7fff);
-
-   tmp = lp_build_and(i32_bld, clamped, i32_roundmask);
-   tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, );
-   /* bias exponent (and denormalize if necessary) */
-   magic = lp_build_const_int_vec(gallivm, i32_type,
-  ((1  (exponent_bits - 1)) - 1)  23);
-   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, );
-   normal = lp_build_mul(f32_bld, tmp, magic);
-
-   /* clamp to max value */
-   small_max = lp_build_const_int_vec(gallivm, i32_type,
-  (((1  exponent_bits) - 2)  23) |
-  (((1  mantissa_bits) - 1)  (23 - 
mantissa_bits)));
-   small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, );
-   normal = lp_build_min(f32_bld, normal, 

[Mesa-dev] [PATCH] gallivm: Add code for rgb9e5 shared exponent format to float conversion

2013-03-22 Thread sroland
From: Roland Scheidegger srol...@vmware.com

And use this (and the code for r11g11b10 packed float to float conversion)
in the soa texturing code (the generated code looks quite good).
This should probably be an order of magnitude faster than using the fallback
(not measured).
Tested with piglit texwrap GL_EXT_packed_float and
GL_EXT_texture_shared_exponent respectively (didn't find much else using
it).
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c   |   86 +
 src/gallium/auxiliary/gallivm/lp_bld_conv.h   |5 ++
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c |   30 ++-
 3 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 053f413..2f39abc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -397,6 +397,92 @@ lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
 }
 
 
+static LLVMValueRef
+lp_build_rgb9_to_float_helper(struct gallivm_state *gallivm,
+  struct lp_type f32_type,
+  LLVMValueRef src,
+  LLVMValueRef scale,
+  unsigned mantissa_start)
+{
+   LLVMValueRef shift, mask;
+
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
+   struct lp_build_context i32_bld, f32_bld;
+
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+
+   /*
+* This is much easier as other weirdo float formats, since
+* there's no sign, no Inf/NaN, and there's nothing special
+* required for normals/denormals neither (as without the implied one
+* for the mantissa for other formats, everything looks like a denormal).
+* So just do (float)comp_bits * scale
+*/
+   shift = lp_build_const_int_vec(gallivm, i32_type, mantissa_start);
+   mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff);
+   src = lp_build_shr(&i32_bld, src, shift);
+   src = lp_build_and(&i32_bld, src, mask);
+   src = lp_build_int_to_float(&f32_bld, src);
+   return lp_build_mul(&f32_bld, src, scale);
+}
+
+
+/**
+ * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
+ *
+ * @param src   packed AoS rgb9e5 values (as (vector) int32)
+ * @param dst   pointer to the SoA result values
+ */
+void
+lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ LLVMValueRef *dst)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef shift, scale, bias, exp;
+   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+LLVMGetVectorSize(src_type) : 1;
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
+   struct lp_type u32_type = lp_type_uint_vec(32, 32 * src_length);
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
+   struct lp_build_context i32_bld, u32_bld, f32_bld;
+
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+   lp_build_context_init(&u32_bld, gallivm, u32_type);
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+
+   /* extract exponent */
+   shift = lp_build_const_int_vec(gallivm, i32_type, 27);
+   /* this shift needs to be unsigned otherwise need mask */
+   exp = lp_build_shr(&u32_bld, src, shift);
+
+   /*
+* scale factor is 2 ^ (exp - bias)
+* (and additionally corrected here for the mantissa bits)
+* not using shift because
+* a) don't have vector shift in a lot of cases
+* b) shift direction changes hence need 2 shifts + conditional
+*(or rotate instruction which is even more rare (for instance XOP))
+* so use whacky float 2 ^ function instead manipulating exponent
+* (saves us the float conversion at the end too)
+*/
+   bias = lp_build_const_int_vec(gallivm, i32_type, 127 - (15 + 9));
+   scale = lp_build_add(&i32_bld, exp, bias);
+   shift = lp_build_const_int_vec(gallivm, i32_type, 23);
+   scale = lp_build_shl(&i32_bld, scale, shift);
+   scale = LLVMBuildBitCast(builder, scale, f32_bld.vec_type, "");
+
+   dst[0] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 0);
+   dst[1] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 9);
+   dst[2] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 18);
+
+   /* Just set alpha to one */
+   dst[3] = f32_bld.one;
+}
+
+
 /**
  * Converts int16 half-float to float32
  * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i 
think?)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index 5bd6f4f..d8bc294 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -70,6 +70,11 @@ lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
   

[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support

2013-03-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

New conversion code to handle conversion from/to r11g11b10 AoS to/from
SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA
(which works pretty much the same as r11g11b10 except for the packing).
(This code should also be used for texture sampling instead of
relying on u_format conversion but it's not yet, so rgb9e5 is unused.)
Unfortunately a crazy amount of hacks is necessary to get the conversion
code running in llvmpipe's generate_unswizzled_blend, which isn't well
suited for formats where the storage representation has nothing to do
with what's needed for blending (moreover, the conversion will convert
from packed AoS values, which is the storage format, to float SoA values,
because this is much more natural for the conversion, and likewise from
SoA values to packed AoS values - but the blend (which includes
trivial things like partial mask) works on AoS values, so incoming fs
values will go SoA->AoS, values from destination will go packed
AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS which probably
isn't the most efficient way though the shuffles are probably bearable).

Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
still need to verify Inf/NaNs (where most of the complexity in the
conversion comes from actually).
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |  314 +++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h |   14 ++
 src/gallium/drivers/llvmpipe/lp_screen.c|6 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  126 +++
 4 files changed, 458 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649d..4fce1bc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -155,6 +155,320 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 
 /**
+ * Convert float32 to a float-like value with less exponent and mantissa
+ * bits. The mantissa is still biased, and the mantissa still has an implied 1,
+ * but there's no sign bit.
+ *
+ * @param src (vector) float value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ *
+ * Unlike float_to_half using accurate method here.
+ * This implements round-towards-zero (trunc) hence too large numbers get
+ * converted to largest representable number, not infinity.
+ * Small numbers may get converted to denorms, depending on normal
+ * float denorm handling of the cpu.
+ * Note that compared to the references, below, we skip any rounding bias
+ * and do strict rounding towards zero (if I got the constants right...)
+ * - OpenGL allows rounding towards zero (though not preferred) and
+ * DX10 even seems to require it.
+ * Note that this will not try to pack the values somehow - they will
+ * look like rescaled floats (except for Inf/NaN) (but returned as
+ * (vector) int32).
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
+LLVMValueRef src,
+unsigned mantissa_bits,
+unsigned exponent_bits)
+{
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
+   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
+   LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
+   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+LLVMGetVectorSize(src_type) : 1;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_build_context f32_bld, i32_bld;
+   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+ ((1 << exponent_bits) - 1) << 23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+
+   /* ordinary number */
+   /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
+   clamped = lp_build_max(f32_bld, src, zero);
+   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, );
+   /* get rid of excess mantissa bits */
+   /* really not sure about that constant */
+   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
+  ~((1  (23 - mantissa_bits)) - 1));
+
+   tmp = lp_build_and(i32_bld, clamped, i32_roundmask);
+   tmp = LLVMBuildBitCast(builder, tmp, 

[Mesa-dev] [PATCH] llvmpipe: add EXT_packed_float render target format support

2013-03-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

New conversion code to handle conversion from/to r11g11b10 AoS to/from
SoA floats, and also add code for conversion from rgb9e5 AoS to float SoA
(which works pretty much the same as r11g11b10 except for the packing).
(This code should also be used for texture sampling instead of
relying on u_format conversion but it's not yet, so rgb9e5 is unused.)
Unfortunately a crazy amount of hacks is necessary to get the conversion
code running in llvmpipe's generate_unswizzled_blend, which isn't well
suited for formats where the storage representation has nothing to do
with what's needed for blending (moreover, the conversion will convert
from packed AoS values, which is the storage format, to float SoA values,
because this is much more natural for the conversion, and likewise from
SoA values to packed AoS values - but the blend (which includes
trivial things like partial mask) works on AoS values, so incoming fs
values will go SoA->AoS, values from destination will go packed
AoS->SoA->AoS, then do blend, then AoS->SoA->packed AoS which probably
isn't the most efficient way though the shuffles are probably bearable).

Passes piglit fbo-blending-formats (with GL_EXT_packed_float parameter),
still need to verify Inf/NaNs (where most of the complexity in the
conversion comes from actually).

v2: drop the (very bogus) rgb9e5 part, and do component extraction
in the helper code for r11g11b10 to float conversion, making the code
slightly more compact (suggested by Jose), now that there are no other
callers left this works quite well. (Could do the same for the
opposite way but it's less than ideal there, final part of packing
needs to be done in caller anyway and there'd be another conditional.)
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |  250 +++
 src/gallium/auxiliary/gallivm/lp_bld_conv.h |9 +
 src/gallium/drivers/llvmpipe/lp_screen.c|6 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  126 ++
 4 files changed, 389 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index dc3649d..06d64c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -155,6 +155,256 @@ lp_build_bswap_vec(struct gallivm_state *gallivm,
 
 
 /**
+ * Convert float32 to a float-like value with less exponent and mantissa
+ * bits. The mantissa is still biased, and the mantissa still has an implied 1,
+ * but there's no sign bit.
+ *
+ * @param src (vector) float value to convert
+ * @param mantissa_bits   the number of mantissa bits
+ * @param exponent_bits   the number of exponent bits
+ *
+ * Unlike float_to_half using accurate method here.
+ * This implements round-towards-zero (trunc) hence too large numbers get
+ * converted to largest representable number, not infinity.
+ * Small numbers may get converted to denorms, depending on normal
+ * float denorm handling of the cpu.
+ * Note that compared to the references, below, we skip any rounding bias
+ * and do strict rounding towards zero (if I got the constants right...)
+ * - OpenGL allows rounding towards zero (though not preferred) and
+ * DX10 even seems to require it.
+ * Note that this will not try to pack the values somehow - they will
+ * look like rescaled floats (except for Inf/NaN) (but returned as
+ * (vector) int32).
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ * ref https://gist.github.com/rygorous/2156668
+ */
+static LLVMValueRef
+lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
+LLVMValueRef src,
+unsigned mantissa_bits,
+unsigned exponent_bits)
+{
+   LLVMBuilderRef builder = gallivm-builder;
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
+   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
+   LLVMValueRef isnan, isposinf, isnanorposinf, i32_qnanbit, nanorposinfnum;
+   unsigned length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
+LLVMGetVectorSize(src_type) : 1;
+   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
+   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
+   struct lp_build_context f32_bld, i32_bld;
+   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+   lp_build_context_init(&i32_bld, gallivm, i32_type);
+
+   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
+ ((1 << exponent_bits) - 1) << 23);
+   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+
+   /* ordinary number */
+   /* clamp to pos range (can still have sign bit if NaN but doesn't matter) */
+   clamped = lp_build_max(f32_bld, 

[Mesa-dev] [PATCH] gallivm: fix returning unconditionally from main on TGSI_OPCODE_RET

2013-03-15 Thread sroland
From: Roland Scheidegger srol...@vmware.com

If we're in some conditional we must not return, or the code after
the condition is never executed.
(Probably the same for loops.)
This fixes https://bugs.freedesktop.org/show_bug.cgi?id=62357.

Note: This is a candidate for the stable branches.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 0dc26b5..b5f0ace 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -348,7 +348,9 @@ static void lp_exec_mask_ret(struct lp_exec_mask *mask, int 
*pc)
LLVMBuilderRef builder = mask-bld-gallivm-builder;
LLVMValueRef exec_mask;
 
-   if (mask->call_stack_size == 0) {
+   if (mask->call_stack_size == 0 &&
+   mask->cond_stack_size == 0 &&
+   mask->loop_stack_size == 0) {
   /* returning from main() */
   *pc = -1;
   return;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix return opcode handling in main function of a shader

2013-03-15 Thread sroland
From: Roland Scheidegger srol...@vmware.com

If we're in some conditional or loop we must not return, or the code
after the condition is never executed.
(v2): And, we also can't just continue as nothing happened, since the
mask update code would later check if we actually have a mask, so we
need to remember that there was a return in main where we didn't exit
(to illustrate this, a ret in an if clause would cause a mask update
which is still ok as we're in a conditional, but after the endif the
mask update code would drop the mask hence bringing execution back to
pixels which should have their execution mask set to zero by the ret).
Thanks to Christoph Bumiller for figuring this out.

This fixes https://bugs.freedesktop.org/show_bug.cgi?id=62357.

Note: This is a candidate for the stable branches.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h |1 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |   20 +---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index dac97c3..6e65e12 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -243,6 +243,7 @@ struct lp_exec_mask {
struct lp_build_context *bld;
 
boolean has_mask;
+   boolean ret_in_main;
 
LLVMTypeRef int_vec_type;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 0dc26b5..965255a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -73,6 +73,7 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, 
struct lp_build_context
 
mask->bld = bld;
mask->has_mask = FALSE;
+   mask->ret_in_main = FALSE;
mask->cond_stack_size = 0;
mask->loop_stack_size = 0;
mask->call_stack_size = 0;
@@ -108,7 +109,7 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
} else
   mask-exec_mask = mask-cond_mask;
 
-   if (mask->call_stack_size) {
+   if (mask->call_stack_size || mask->ret_in_main) {
   mask->exec_mask = LLVMBuildAnd(builder,
  mask->exec_mask,
  mask->ret_mask,
@@ -117,7 +118,8 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
 
mask->has_mask = (mask->cond_stack_size > 0 ||
  mask->loop_stack_size > 0 ||
- mask->call_stack_size > 0);
+ mask->call_stack_size > 0 ||
+ mask->ret_in_main);
 }
 
 static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
@@ -348,11 +350,23 @@ static void lp_exec_mask_ret(struct lp_exec_mask *mask, 
int *pc)
LLVMBuilderRef builder = mask-bld-gallivm-builder;
LLVMValueRef exec_mask;
 
-   if (mask->call_stack_size == 0) {
+   if (mask->cond_stack_size == 0 &&
+   mask->loop_stack_size == 0 &&
+   mask->call_stack_size == 0) {
   /* returning from main() */
   *pc = -1;
   return;
}
+
+   if (mask->call_stack_size == 0) {
+  /*
+   * This requires special handling since we need to ensure
+   * we don't drop the mask even if we have no call stack
+   * (e.g. after a ret in a if clause after the endif)
+   */
+  mask->ret_in_main = TRUE;
+   }
+
exec_mask = LLVMBuildNot(builder,
 mask->exec_mask,
 "ret");
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] softpipe: don't assert when creating surfaces with multiple layers

2013-03-13 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We can't handle them yet; however, we can safely just warn (we will
just render to the first layer, which is fine since we can't handle the
rendertarget system value either).
Also make behavior more predictable with buffer surfaces
(it would sometimes hit bogus asserts because of the union in the surface,
instead create the surface but assert when trying to set a buffer
in the framebuffer).
---
 src/gallium/drivers/softpipe/sp_texture.c|   30 +-
 src/gallium/drivers/softpipe/sp_tile_cache.c |   18 ++--
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_texture.c 
b/src/gallium/drivers/softpipe/sp_texture.c
index 0d1481a..2db0de8 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -283,10 +283,6 @@ softpipe_create_surface(struct pipe_context *pipe,
 const struct pipe_surface *surf_tmpl)
 {
struct pipe_surface *ps;
-   unsigned level = surf_tmpl->u.tex.level;
-
-   assert(level <= pt->last_level);
-   assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
 
ps = CALLOC_STRUCT(pipe_surface);
if (ps) {
@@ -294,12 +290,26 @@ softpipe_create_surface(struct pipe_context *pipe,
   pipe_resource_reference(&ps->texture, pt);
   ps->context = pipe;
   ps->format = surf_tmpl->format;
-  ps->width = u_minify(pt->width0, level);
-  ps->height = u_minify(pt->height0, level);
-
-  ps->u.tex.level = level;
-  ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
-  ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
+  if (pt->target != PIPE_BUFFER) {
+ assert(surf_tmpl->u.tex.level <= pt->last_level);
+ ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level);
+ ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level);
+ ps->u.tex.level = surf_tmpl->u.tex.level;
+ ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
+ ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
+ if (ps->u.tex.first_layer != ps->u.tex.last_layer) {
+debug_printf("creating surface with multiple layers, rendering to 
first layer only\n");
+ }
+  }
+  else {
+ /* setting width as number of elements should get us correct 
renderbuffer width */
+ ps->width = surf_tmpl->u.buf.last_element - 
surf_tmpl->u.buf.first_element + 1;
+ ps->height = pt->height0;
+ ps->u.buf.first_element = surf_tmpl->u.buf.first_element;
+ ps->u.buf.last_element = surf_tmpl->u.buf.last_element;
+ assert(ps->u.buf.first_element <= ps->u.buf.last_element);
+ assert(ps->u.buf.last_element < ps->width);
+  }
}
return ps;
 }
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c 
b/src/gallium/drivers/softpipe/sp_tile_cache.c
index dded0e1..b6dd6af 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -170,12 +170,18 @@ sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
tc-surface = ps;
 
if (ps) {
-  tc-transfer_map = pipe_transfer_map(pipe, ps-texture,
-   ps-u.tex.level, 
ps-u.tex.first_layer,
-   PIPE_TRANSFER_READ_WRITE |
-   PIPE_TRANSFER_UNSYNCHRONIZED,
-   0, 0, ps-width, ps-height,
-   tc-transfer);
+  if (ps-texture-target != PIPE_BUFFER) {
+ tc-transfer_map = pipe_transfer_map(pipe, ps-texture,
+  ps-u.tex.level, 
ps-u.tex.first_layer,
+  PIPE_TRANSFER_READ_WRITE |
+  PIPE_TRANSFER_UNSYNCHRONIZED,
+  0, 0, ps-width, ps-height,
+  tc-transfer);
+  }
+  else {
+ /* can't render to buffers */
+ assert(0);
+  }
 
   tc-depth_stencil = util_format_is_depth_or_stencil(ps-format);
}
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] llvmpipe: don't assert when trying to render to surfaces with multiple layers

2013-03-13 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Instead, just warn when creating the surface; rendering will simply happen
to the first layer.
---
 src/gallium/drivers/llvmpipe/lp_scene.c   |2 --
 src/gallium/drivers/llvmpipe/lp_texture.c |3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c 
b/src/gallium/drivers/llvmpipe/lp_scene.c
index a0912eb..a888586 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -157,7 +157,6 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
for (i = 0; i < scene->fb.nr_cbufs; i++) {
   struct pipe_surface *cbuf = scene->fb.cbufs[i];
   if (llvmpipe_resource_is_texture(cbuf->texture)) {
- assert(cbuf->u.tex.first_layer == cbuf->u.tex.last_layer);
  scene->cbufs[i].stride = llvmpipe_resource_stride(cbuf->texture,
cbuf->u.tex.level);
 
@@ -178,7 +177,6 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
 
if (fb-zsbuf) {
   struct pipe_surface *zsbuf = scene-fb.zsbuf;
-  assert(zsbuf-u.tex.first_layer == zsbuf-u.tex.last_layer);
   scene-zsbuf.stride = llvmpipe_resource_stride(zsbuf-texture, 
zsbuf-u.tex.level);
   scene-zsbuf.blocksize = 
  util_format_get_blocksize(zsbuf-texture-format);
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 9de05e7..99bd6d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -593,6 +593,9 @@ llvmpipe_create_surface(struct pipe_context *pipe,
  ps->u.tex.level = surf_tmpl->u.tex.level;
  ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
  ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
+ if (ps->u.tex.first_layer != ps->u.tex.last_layer) {
+debug_printf("creating surface with multiple layers, rendering to 
first layer only\n");
+ }
+ }
   }
   else {
  /* setting width as number of elements should get us correct 
renderbuffer width */
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] tgsi: fix sample_d emit for arrays

2013-03-13 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Those cases were apparently forgotten.
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c |   30 +++---
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 4488397..3df3ac3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2371,50 +2371,42 @@ exec_sample_d(struct tgsi_exec_machine *mach,
/* always fetch all 3 offsets, overkill but keeps code simple */
fetch_texel_offsets(mach, inst, offsets);
 
+   FETCH(r[0], 0, TGSI_CHAN_X);
+
switch (mach-SamplerViews[resource_unit].Resource) {
case TGSI_TEXTURE_1D:
-  FETCH(r[0], 0, TGSI_CHAN_X);
+   case TGSI_TEXTURE_1D_ARRAY:
+  /* only 1D array actually needs Y */
+  FETCH(r[1], 0, TGSI_CHAN_Y);
 
   fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
 
   fetch_texel(mach-Sampler, resource_unit, sampler_unit,
-  r[0], ZeroVec, ZeroVec, ZeroVec, ZeroVec,   /* S, T, P, 
C, LOD */
+  r[0], r[1], ZeroVec, ZeroVec, ZeroVec,   /* S, T, P, C, 
LOD */
   derivs, offsets, tgsi_sampler_derivs_explicit,
   r[0], r[1], r[2], r[3]);   /* R, G, B, A */
   break;
 
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
-  FETCH(r[0], 0, TGSI_CHAN_X);
+   case TGSI_TEXTURE_2D_ARRAY:
+  /* only 2D array actually needs Z */
   FETCH(r[1], 0, TGSI_CHAN_Y);
+  FETCH(r[2], 0, TGSI_CHAN_Z);
 
   fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
   fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
 
   fetch_texel(mach-Sampler, resource_unit, sampler_unit,
-  r[0], r[1], ZeroVec, ZeroVec, ZeroVec,   /* inputs */
+  r[0], r[1], r[2], ZeroVec, ZeroVec,   /* inputs */
   derivs, offsets, tgsi_sampler_derivs_explicit,
   r[0], r[1], r[2], r[3]); /* outputs */
   break;
 
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
-  FETCH(r[0], 0, TGSI_CHAN_X);
-  FETCH(r[1], 0, TGSI_CHAN_Y);
-  FETCH(r[2], 0, TGSI_CHAN_Z);
-
-  fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
-  fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
-  fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
-
-  fetch_texel(mach-Sampler, resource_unit, sampler_unit,
-  r[0], r[1], r[2], ZeroVec, ZeroVec,
-  derivs, offsets, tgsi_sampler_derivs_explicit,
-  r[0], r[1], r[2], r[3]);
-  break;
-
case TGSI_TEXTURE_CUBE_ARRAY:
-  FETCH(r[0], 0, TGSI_CHAN_X);
+  /* only cube array actually needs W */
   FETCH(r[1], 0, TGSI_CHAN_Y);
   FETCH(r[2], 0, TGSI_CHAN_Z);
   FETCH(r[3], 0, TGSI_CHAN_W);
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] tgsi: emit code for SVIEWINFO and SAMPLE_I

2013-03-08 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Can handle them since the single sampler interface was introduced.
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c |   18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 9945d42..8c6890b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2139,7 +2139,8 @@ exec_txd(struct tgsi_exec_machine *mach,
 
 static void
 exec_txf(struct tgsi_exec_machine *mach,
- const struct tgsi_full_instruction *inst)
+ const struct tgsi_full_instruction *inst,
+ boolean is_samplei)
 {
const uint unit = inst-Src[1].Register.Index;
union tgsi_exec_channel r[4];
@@ -2147,13 +2148,20 @@ exec_txf(struct tgsi_exec_machine *mach,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
int j;
int8_t offsets[3];
+   unsigned target;
 
/* always fetch all 3 offsets, overkill but keeps code simple */
fetch_texel_offsets(mach, inst, offsets);
 
IFETCH(r[3], 0, TGSI_CHAN_W);
 
-   switch(inst-Texture.Texture) {
+   if (is_samplei) {
+  target = mach-SamplerViews[unit].Resource;
+   }
+   else {
+  target = inst-Texture.Texture;
+   }
+   switch(target) {
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_2D_ARRAY:
case TGSI_TEXTURE_SHADOW2D_ARRAY:
@@ -4112,7 +4120,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_TXF:
-  exec_txf(mach, inst);
+  exec_txf(mach, inst, FALSE);
   break;
 
case TGSI_OPCODE_TXQ:
@@ -4339,7 +4347,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_SAMPLE_I:
-  assert(0);
+  exec_txf(mach, inst, TRUE);
   break;
 
case TGSI_OPCODE_SAMPLE_I_MS:
@@ -4375,7 +4383,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_SVIEWINFO:
-  assert(0);
+  exec_txq(mach, inst);
   break;
 
case TGSI_OPCODE_SAMPLE_POS:
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: clean up passing derivatives around

2013-03-08 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Previously, the derivatives were calculated and passed in a packed form
to the sample code (for implicit derivatives, explicit derivatives were
packed to the same format).
There are several reasons why this wasn't such a good idea:
1) the derivatives may not even be needed (not as bad as it sounds since
llvm will just throw the calculations needed for them away but still)
2) the special packing format really shouldn't be part of the sampler
interface
3) depending what the sample code actually does the derivatives will
be processed differently, hence there is no ideal packing. For cube
maps with explicit derivatives (which we don't do yet) for instance the
packing looked downright useless, and for non-isotropic filtering we'd
need different calculations too.

So, instead just pass the derivatives as is (for explicit derivatives),
or let the rho calculating sample code calculate them itself. This still
does exactly the same packing stuff for implicit derivatives for now,
though explicit ones are handled in a more straightforward manner (quick
estimates show performance should be quite similar, though it is much
easier to follow and also does the rho calculation per-pixel until the
end, which we eventually need for spec compliance anyway).

No piglit changes.
---
 src/gallium/auxiliary/gallivm/lp_bld_quad.c   |   14 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  271 +
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |6 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |   11 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c   |   21 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c   |  122 +-
 6 files changed, 196 insertions(+), 249 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c 
b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index 8a0efed..1955add 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -79,14 +79,9 @@ lp_build_ddy(struct lp_build_context *bld,
 }
 
 /*
- * To be able to handle multiple quads at once in texture sampling and
- * do lod calculations per quad, it is necessary to get the per-quad
- * derivatives into the lp_build_rho function.
- * For 8-wide vectors the packed derivative values for 3 coords would
- * look like this, this scales to a arbitrary (multiple of 4) vector size:
- * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * Helper for building packed ddx/ddy vector for one coord (scalar per quad
+ * values). The vector will look like this (8-wide):
  * dr1dx dr1dy _ _ dr2dx dr2dy _ _
- * The second vector will be unused for 1d and 2d textures.
  */
 LLVMValueRef
 lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
@@ -121,6 +116,11 @@ lp_build_packed_ddx_ddy_onecoord(struct lp_build_context 
*bld,
 }
 
 
+/*
+ * Helper for building packed ddx/ddy vector for two coords (scalar per quad
+ * values). The vector will look like this (8-wide):
+ * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ */
 LLVMValueRef
 lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
  LLVMValueRef a, LLVMValueRef b)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index ef0631c..fc8bae7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -46,6 +46,7 @@
 #include lp_bld_type.h
 #include lp_bld_logic.h
 #include lp_bld_pack.h
+#include lp_bld_quad.h
 
 
 /*
@@ -203,6 +204,9 @@ lp_sampler_static_sampler_state(struct 
lp_static_sampler_state *state,
 static LLVMValueRef
 lp_build_rho(struct lp_build_sample_context *bld,
  unsigned texture_unit,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
  const struct lp_derivatives *derivs)
 {
struct gallivm_state *gallivm = bld-gallivm;
@@ -211,8 +215,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
struct lp_build_context *float_bld = bld-float_bld;
struct lp_build_context *coord_bld = bld-coord_bld;
struct lp_build_context *perquadf_bld = bld-perquadf_bld;
-   const LLVMValueRef *ddx_ddy = derivs-ddx_ddy;
const unsigned dims = bld-dims;
+   LLVMValueRef ddx_ddy[2];
LLVMBuilderRef builder = bld-gallivm-builder;
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld-gallivm-context);
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
@@ -229,59 +233,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
LLVMValueRef i32undef = 
LLVMGetUndef(LLVMInt32TypeInContext(gallivm-context));
LLVMValueRef rho_xvec, rho_yvec;
 
-   abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-   if (dims  2) {
-  abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
-   }
-   else {
-  abs_ddx_ddy[1] = NULL;
-   }
-
-   if (dims == 1) {
-  static const unsigned char swizzle1[] = {
- 0, 

[Mesa-dev] [PATCH] tgsi: handle projection modifier for array textures.

2013-03-05 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This partly reverts 6ace2e41da7dded630d932d03bacb7e14a93d47a.
Apparently with GL_MESA_texture_array fixed-function texturing
with texture arrays is possible, and hence we have to handle TXP.
(Though no one seems to know the semantics, softpipe now does what
it did before, which is to NOT project the array coord, llvmpipe
for instance however indeed does project the array coord. Unlike
before it will project the comparison coord for shadow1d array, as
that clearly was an error.)
This fixes https://bugs.freedesktop.org/show_bug.cgi?id=61828.
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c |   14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index da5594b..6a74ef3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1908,7 +1908,9 @@ exec_tex(struct tgsi_exec_machine *mach,
   FETCH(r[0], 0, TGSI_CHAN_X);
   FETCH(r[1], 0, TGSI_CHAN_Y);
 
-  assert(modifier != TEX_MODIFIER_PROJECTED);
+  if (modifier == TEX_MODIFIER_PROJECTED) {
+ micro_div(r[0], r[0], r[3]);
+  }
 
   fetch_texel(mach-Sampler, unit, unit,
   r[0], r[1], ZeroVec, ZeroVec, lod,   /* S, T, P, C, LOD 
*/
@@ -1920,7 +1922,10 @@ exec_tex(struct tgsi_exec_machine *mach,
   FETCH(r[1], 0, TGSI_CHAN_Y);
   FETCH(r[2], 0, TGSI_CHAN_Z);
 
-  assert(modifier != TEX_MODIFIER_PROJECTED);
+  if (modifier == TEX_MODIFIER_PROJECTED) {
+ micro_div(r[0], r[0], r[3]);
+ micro_div(r[2], r[2], r[3]);
+  }
 
   fetch_texel(mach-Sampler, unit, unit,
   r[0], r[1], r[2], ZeroVec, lod,   /* S, T, P, C, LOD */
@@ -1933,7 +1938,10 @@ exec_tex(struct tgsi_exec_machine *mach,
   FETCH(r[1], 0, TGSI_CHAN_Y);
   FETCH(r[2], 0, TGSI_CHAN_Z);
 
-  assert(modifier != TEX_MODIFIER_PROJECTED);
+  if (modifier == TEX_MODIFIER_PROJECTED) {
+ micro_div(r[0], r[0], r[3]);
+ micro_div(r[1], r[1], r[3]);
+  }
 
   fetch_texel(mach-Sampler, unit, unit,
   r[0], r[1], r[2], ZeroVec, lod,   /* S, T, P, C, LOD */
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw/llvm: skip clipping and viewport transform if there's no position output

2013-03-01 Thread sroland
From: Roland Scheidegger srol...@vmware.com

With glsl 1.40 writing position is not required (useful for transform
feedback, though in fact it's still possible to rasterize such geometry
even if the results aren't too well defined).
Prevents crashes in that case. Fixes piglit glsl-1.40-tf-no-position.
Not quite sure this is 100% correct as it also skips clipdistance
clipping which could still work (but not sure if the result would
really be needed?)
---
 src/gallium/auxiliary/draw/draw_llvm.c |   57 +---
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 8e46687..763158b 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1349,36 +1349,41 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
   sampler,
   variant-key.clamp_vertex_color);
 
-  /* store original positions in clip before further manipulation */
-  store_clip(gallivm, vs_type, io, outputs, 0, cv);
-  store_clip(gallivm, vs_type, io, outputs, 1, pos);
-
-  /* do cliptest */
-  if (enable_cliptest) {
- LLVMValueRef temp = LLVMBuildLoad(builder, clipmask_bool_ptr, );
- /* allocate clipmask, assign it integer type */
- clipmask = generate_clipmask(llvm,
-  gallivm,
-  vs_type,
-  outputs,
-  variant-key.clip_xy,
-  variant-key.clip_z, 
-  variant-key.clip_user,
-  variant-key.clip_halfz,
-  variant-key.ucp_enable,
-  context_ptr, have_clipdist);
- temp = LLVMBuildOr(builder, clipmask, temp, );
- /* store temporary clipping boolean value */
- LLVMBuildStore(builder, temp, clipmask_bool_ptr);
+  if (pos != -1) {
+ /* store original positions in clip before further manipulation */
+ store_clip(gallivm, vs_type, io, outputs, 0, cv);
+ store_clip(gallivm, vs_type, io, outputs, 1, pos);
+
+ /* do cliptest */
+ if (enable_cliptest) {
+LLVMValueRef temp = LLVMBuildLoad(builder, clipmask_bool_ptr, );
+/* allocate clipmask, assign it integer type */
+clipmask = generate_clipmask(llvm,
+ gallivm,
+ vs_type,
+ outputs,
+ variant-key.clip_xy,
+ variant-key.clip_z,
+ variant-key.clip_user,
+ variant-key.clip_halfz,
+ variant-key.ucp_enable,
+ context_ptr, have_clipdist);
+temp = LLVMBuildOr(builder, clipmask, temp, );
+/* store temporary clipping boolean value */
+LLVMBuildStore(builder, temp, clipmask_bool_ptr);
+ }
+ else {
+clipmask = lp_build_const_int_vec(gallivm, lp_int_type(vs_type), 
0);
+ }
+
+ /* do viewport mapping */
+ if (!bypass_viewport) {
+generate_viewport(variant, builder, vs_type, outputs, context_ptr);
+ }
   }
   else {
  clipmask = lp_build_const_int_vec(gallivm, lp_int_type(vs_type), 0);
   }
-  
-  /* do viewport mapping */
-  if (!bypass_viewport) {
- generate_viewport(variant, builder, vs_type, outputs, context_ptr);
-  }
 
   /* store clipmask in vertex header, 
* original positions in clip 
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: don't assert on illegal surface creation.

2013-03-01 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Since c8eb2d0e829d0d2aea6a982620da0d3cfb5982e2 llvmpipe checks if it's
actually legal to create a surface. The OpenGL state tracker doesn't quite
obey this, so for now just warn instead of asserting.
Also warn, rather than keeping a disabled assert, when creating sampler views
(same reasoning).

Addresses https://bugs.freedesktop.org/show_bug.cgi?id=61647.
---
 src/gallium/drivers/llvmpipe/lp_state_sampler.c |5 ++---
 src/gallium/drivers/llvmpipe/lp_texture.c   |3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 3b31d4f..7441973 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -218,9 +218,8 @@ llvmpipe_create_sampler_view(struct pipe_context *pipe,
 * XXX we REALLY want to see the correct bind flag here but the OpenGL
 * state tracker can't guarantee that at least for texture buffer objects.
 */
-#if 0
-   assert(texture-bind  PIPE_BIND_SAMPLER_VIEW);
-#endif
+   if (!(texture-bind  PIPE_BIND_SAMPLER_VIEW))
+  debug_printf(Illegal sampler view creation without bind flag\n);
 
if (view) {
   *view = *templ;
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index e4ae3c1..9de05e7 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -577,7 +577,8 @@ llvmpipe_create_surface(struct pipe_context *pipe,
struct pipe_surface *ps;
 
assert(surf_tmpl-u.tex.level = pt-last_level);
-   assert(pt-bind  (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET));
+   if (!(pt-bind  (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET)))
+  debug_printf(Illegal surface creation without bind flag\n);
 
ps = CALLOC_STRUCT(pipe_surface);
if (ps) {
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] tgsi: add texel offsets and derivatives to sampler interface

2013-03-01 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Something I never got around to implementing, but this is the tgsi execution
side for implementing texel offsets (for ordinary texturing) and explicit
derivatives for sampling (though I guess the ordering of the components
for the derivs parameters is debatable).
There is certainly a runtime cost associated with this.
Unless there are different interfaces used depending on the complexity
of the texture instructions, this is impossible to avoid.
Offsets are always active (I think checking if they are active or not is
probably not worth it since it should mostly be an add), whereas the
sampler_control is extended for explicit derivatives.
For now softpipe (the only user of this) just drops all those new values
on the floor (which is the part I never implemented...).

Additionally this also fixes (discovered by accident) inconsistent
projective divide for the comparison coord - the code did do the
projection for shadow2d targets, but not shadow1d ones. This also
drops checking for projection modifier on array targets, since they
aren't possible in any extension I know of (hence we don't actually
know if the array layer should also be divided or not).
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c   |  255 --
 src/gallium/auxiliary/tgsi/tgsi_exec.h   |   10 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c |   10 +-
 3 files changed, 210 insertions(+), 65 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 6277c3e..feb5928 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1720,6 +1720,8 @@ fetch_texel( struct tgsi_sampler *sampler,
  const union tgsi_exec_channel *p,
  const union tgsi_exec_channel *c0,
  const union tgsi_exec_channel *c1,
+ float derivs[3][2][TGSI_QUAD_SIZE],
+ const int8_t offset[3],
  enum tgsi_sampler_control control,
  union tgsi_exec_channel *r,
  union tgsi_exec_channel *g,
@@ -1731,7 +1733,7 @@ fetch_texel( struct tgsi_sampler *sampler,
 
/* FIXME: handle explicit derivs, offsets */
sampler-get_samples(sampler, sview_idx, sampler_idx,
-s-f, t-f, p-f, c0-f, c1-f, control, rgba);
+s-f, t-f, p-f, c0-f, c1-f, derivs, offset, 
control, rgba);
 
for (j = 0; j  4; j++) {
   r-f[j] = rgba[0][j];
@@ -1765,6 +1767,23 @@ exec_tex(struct tgsi_exec_machine *mach,
const union tgsi_exec_channel *lod = ZeroVec;
enum tgsi_sampler_control control =  tgsi_sampler_lod_none;
uint chan;
+   int8_t offsets[3];
+
+   if (inst-Texture.NumOffsets == 1) {
+  union tgsi_exec_channel index;
+  union tgsi_exec_channel offset[3];
+  index.i[0] = index.i[1] = index.i[2] = index.i[3] = 
inst-TexOffsets[0].Index;
+  fetch_src_file_channel(mach, 0, inst-TexOffsets[0].File,
+ inst-TexOffsets[0].SwizzleX, index, ZeroVec, 
offset[0]);
+  fetch_src_file_channel(mach, 0, inst-TexOffsets[0].File,
+ inst-TexOffsets[0].SwizzleY, index, ZeroVec, 
offset[1]);
+  fetch_src_file_channel(mach, 0, inst-TexOffsets[0].File,
+ inst-TexOffsets[0].SwizzleZ, index, ZeroVec, 
offset[2]);
+ offsets[0] = offset[0].i[0];
+ offsets[1] = offset[1].i[0];
+ offsets[2] = offset[2].i[0];
+   } else
+ offsets[0] = offsets[1] = offsets[2] = 0;
 
assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
 
@@ -1791,25 +1810,41 @@ exec_tex(struct tgsi_exec_machine *mach,
 
   fetch_texel(mach-Sampler, unit, unit,
   r[0], ZeroVec, ZeroVec, ZeroVec, lod, /* S, T, P, C, LOD 
*/
-  control,
+  NULL, offsets, control,
   r[0], r[1], r[2], r[3]); /* R, G, B, A */
   break;
+
case TGSI_TEXTURE_SHADOW1D:
   FETCH(r[0], 0, TGSI_CHAN_X);
   FETCH(r[2], 0, TGSI_CHAN_Z);
 
   if (modifier == TEX_MODIFIER_PROJECTED) {
  micro_div(r[0], r[0], r[3]);
+ micro_div(r[2], r[2], r[3]);
   }
 
   fetch_texel(mach-Sampler, unit, unit,
   r[0], ZeroVec, r[2], ZeroVec, lod, /* S, T, P, C, LOD */
-  control,
+  NULL, offsets, control,
   r[0], r[1], r[2], r[3]); /* R, G, B, A */
   break;
 
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
+  FETCH(r[0], 0, TGSI_CHAN_X);
+  FETCH(r[1], 0, TGSI_CHAN_Y);
+
+  if (modifier == TEX_MODIFIER_PROJECTED) {
+ micro_div(r[0], r[0], r[3]);
+ micro_div(r[1], r[1], r[3]);
+  }
+
+  fetch_texel(mach-Sampler, unit, unit,
+  r[0], r[1], ZeroVec, ZeroVec, lod,/* S, T, P, C, LOD 
*/
+  NULL, offsets, control,
+  r[0], r[1], r[2], r[3]);  /* outputs */
+  break;
+
case TGSI_TEXTURE_SHADOW2D:
case 

[Mesa-dev] [PATCH 1/2] draw: fix no position output in non-llvm pipeline.

2013-03-01 Thread sroland
From: Roland Scheidegger srol...@vmware.com

It seems easiest (and best) if we simply skip all the later stages
(after stream output).
(This is different to the llvm case at least for now where we will
simply try to render garbage, though both behaviors should be correct.)
Fixes piglit glsl-1.40-tf-no-position with softpipe.
---
 .../auxiliary/draw/draw_pt_fetch_shade_pipeline.c  |   39 
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c 
b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 2fc8220..f0a48df 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -288,23 +288,30 @@ static void fetch_pipeline_generic( struct 
draw_pt_middle_end *middle,
 vert_info,
 prim_info );
 
-   if (draw_pt_post_vs_run( fpme-post_vs,
-vert_info ))
-   {
-  opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
+   /*
+* if there's no position, need to stop now, or the latter stages
+* will try to access non-existent position output.
 */
-   if (opt  PT_PIPELINE) {
-  pipeline( fpme,
-vert_info,
-prim_info );
-   }
-   else {
-  emit( fpme-emit,
-vert_info,
-prim_info );
+   if (draw_current_shader_position_output(draw) != -1) {
+
+  if (draw_pt_post_vs_run( fpme-post_vs,
+   vert_info ))
+  {
+ opt |= PT_PIPELINE;
+  }
+
+  /* Do we need to run the pipeline?
+   */
+  if (opt  PT_PIPELINE) {
+ pipeline( fpme,
+   vert_info,
+   prim_info );
+  }
+  else {
+ emit( fpme-emit,
+   vert_info,
+   prim_info );
+  }
}
FREE(vert_info-verts);
 }
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] gallivm: add support for texel offsets for ordinary texturing.

2013-02-28 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This was previously only handled for texelFetch (much easier).
Depending on the wrap mode this works slightly differently (for somewhat
efficient implementation), hence have to do that separately in all roughly
137 places - it is easy if we use fixed point coords for wrapping, however
some wrapping modes are near impossible with fixed point (the repeat stuff)
hence we have to normalize the offsets if we can't do the wrapping in
unnormalized space (which is a division which is slow but should still be
much better than the alternative, which would be integer modulo for wrapping
which is just unusable). This should still give accurate results in all
cases that really matter, though it might be not quite conformant behavior
for some apis (but we have much worse problems there anyway even without
using offsets).
(Untested, no piglit test.)
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c |  130 +++-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h |1 +
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  132 +
 3 files changed, 210 insertions(+), 53 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index bddff2c..16d5718 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -64,9 +64,11 @@
  * for scaled integer texcoords.
  * \param block_length  is the length of the pixel block along the
  *  coordinate axis
- * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param coord  the incoming texcoord (s,t or r) scaled to the texture size
+ * \param coord_f  the incoming texcoord (s,t or r) as float vec
  * \param length  the texture size along one dimension
  * \param stride  pixel stride along the coordinate axis (in bytes)
+ * \param offset  the texel offset along the coord axis
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
  * \param out_offset  byte offset for the wrapped coordinate
@@ -79,6 +81,7 @@ lp_build_sample_wrap_nearest_int(struct 
lp_build_sample_context *bld,
  LLVMValueRef coord_f,
  LLVMValueRef length,
  LLVMValueRef stride,
+ LLVMValueRef offset,
  boolean is_pot,
  unsigned wrap_mode,
  LLVMValueRef *out_offset,
@@ -97,6 +100,11 @@ lp_build_sample_wrap_nearest_int(struct 
lp_build_sample_context *bld,
   else {
  struct lp_build_context *coord_bld = bld-coord_bld;
  LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+ if (offset) {
+offset = lp_build_int_to_float(coord_bld, offset);
+offset = lp_build_div(coord_bld, offset, length_f);
+coord_f = lp_build_add(coord_bld, coord_f, offset);
+ }
  coord = lp_build_fract_safe(coord_bld, coord_f);
  coord = lp_build_mul(coord_bld, coord, length_f);
  coord = lp_build_itrunc(coord_bld, coord);
@@ -126,8 +134,9 @@ lp_build_sample_wrap_nearest_int(struct 
lp_build_sample_context *bld,
 /**
  * Build LLVM code for texture coord wrapping, for nearest filtering,
  * for float texcoords.
- * \param coord  the incoming texcoord (s,t,r or q)
+ * \param coord  the incoming texcoord (s,t or r)
  * \param length  the texture size along one dimension
+ * \param offset  the texel offset along the coord axis
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
  * \param icoord  the texcoord after wrapping, as int
@@ -136,6 +145,7 @@ static void
 lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
LLVMValueRef coord,
LLVMValueRef length,
+   LLVMValueRef offset,
boolean is_pot,
unsigned wrap_mode,
LLVMValueRef *icoord)
@@ -145,6 +155,12 @@ lp_build_sample_wrap_nearest_float(struct 
lp_build_sample_context *bld,
 
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
+  if (offset) {
+ /* this is definitely not ideal for POT case */
+ offset = lp_build_int_to_float(coord_bld, offset);
+ offset = lp_build_div(coord_bld, offset, length);
+ coord = lp_build_add(coord_bld, coord, offset);
+  }
   /* take fraction, unnormalize */
   coord = lp_build_fract_safe(coord_bld, coord);
   coord = lp_build_mul(coord_bld, coord, length);
@@ -156,6 +172,10 @@ lp_build_sample_wrap_nearest_float(struct 
lp_build_sample_context *bld,
  /* scale coord to length */
  coord = 

[Mesa-dev] [PATCH 2/2] llvmpipe: bump glsl version to 130

2013-02-28 Thread sroland
From: Roland Scheidegger srol...@vmware.com

texel offsets should have been the last missing feature (not sure
if anything is actually missing for 140). In any case we still
don't do OpenGL 3.0 (missing MSAA which will be difficult,
plus EXT_packed_float, ARB_depth_buffer_float and EXT_framebuffer_sRGB).
---
 src/gallium/drivers/llvmpipe/lp_screen.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index 05bbca5..aab13b3 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -195,7 +195,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_VERTEX_COLOR_CLAMPED:
   return 1;
case PIPE_CAP_GLSL_FEATURE_LEVEL:
-  return 120;
+  return 130;
case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
   return 0;
case PIPE_CAP_COMPUTE:
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: bump glsl version to 140

2013-02-28 Thread sroland
From: Roland Scheidegger srol...@vmware.com

texel offsets should have been the last missing feature (not sure
if anything is actually missing for 140). In any case we still
don't do OpenGL 3.0 (missing MSAA which will be difficult,
plus EXT_packed_float, ARB_depth_buffer_float and EXT_framebuffer_sRGB).

v2: bump to 140 instead - we have everything except we crash when not writing
to gl_Position (but softpipe crashes as well) so let's just say this is a bug
instead. Also (by Dave Airlie's suggestion) update llvm-todo.txt.
---
 src/gallium/docs/llvm-todo.txt   |   16 +---
 src/gallium/drivers/llvmpipe/lp_screen.c |2 +-
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/gallium/docs/llvm-todo.txt b/src/gallium/docs/llvm-todo.txt
index a5a8c1a..02b4b62 100644
--- a/src/gallium/docs/llvm-todo.txt
+++ b/src/gallium/docs/llvm-todo.txt
@@ -4,18 +4,12 @@ TODO covering gallivm/llvmpipe
 Goal: GL3.0 support in llvmpipe
 ---
 
-TXQ opcode support - airlied WIP
-TXF opcode support.
-Integer texture fetch support
-Integer renderbuffer support
-Vertex ID support.
-EXT_transform_feedback support - airlied WIP
-clip distance support - airlied WIP
-vertex clip support - airlied WIP
-EXT_texture_array support - Jakob WIP
+EXT_packed_float support.
+ARB_depth_buffer_float support.
+EXT_framebuffer_sRGB support.
+MSAA support.
+
 
 Goal: extension parity with softpipe:
 -
-GL3.0 support.
-EXT_timer_query - airlied posted a patch
 
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index 05bbca5..aab13b3 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -195,7 +195,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_VERTEX_COLOR_CLAMPED:
   return 1;
case PIPE_CAP_GLSL_FEATURE_LEVEL:
-  return 120;
+  return 130;
case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
   return 0;
case PIPE_CAP_COMPUTE:
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: check buffers in llvmpipe_is_resource_referenced

2013-02-27 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Now that buffers can be used as textures or render targets
make sure they aren't skipped.

Fix suggested by Jose Fonseca.
---
 src/gallium/drivers/llvmpipe/lp_surface.c |   14 +++---
 src/gallium/drivers/llvmpipe/lp_texture.c |4 +++-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c 
b/src/gallium/drivers/llvmpipe/lp_surface.c
index a83a903..5e6a6eb 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -65,13 +65,6 @@ lp_resource_copy(struct pipe_context *pipe,
unsigned depth = src_box-depth;
unsigned z;
 
-   /* Fallback for buffers. */
-   if (dst-target == PIPE_BUFFER  src-target == PIPE_BUFFER) {
-  util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
-src, src_level, src_box);
-  return;
-   }
-
llvmpipe_flush_resource(pipe,
dst, dst_level,
FALSE, /* read_only */
@@ -86,6 +79,13 @@ lp_resource_copy(struct pipe_context *pipe,
FALSE, /* do_not_block */
blit src);
 
+   /* Fallback for buffers. */
+   if (dst-target == PIPE_BUFFER  src-target == PIPE_BUFFER) {
+  util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
+src, src_level, src_box);
+  return;
+   }
+
/*
printf(surface copy from %u lvl %u to %u lvl %u: %u,%u,%u to %u,%u,%u %u x 
%u x %u\n,
   src_tex-id, src_level, dst_tex-id, dst_level,
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index a64139f..75104bb 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -755,7 +755,9 @@ llvmpipe_is_resource_referenced( struct pipe_context *pipe,
 {
struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
 
-   if (presource-target == PIPE_BUFFER)
+   if (!(presource-bind  (PIPE_BIND_DEPTH_STENCIL |
+PIPE_BIND_RENDER_TARGET |
+PIPE_BIND_SAMPLER_VIEW)))
   return LP_UNREFERENCED;
 
return lp_setup_is_resource_referenced(llvmpipe-setup, presource);
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: support rendering to buffer render targets.

2013-02-26 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Unfortunately not usable from OpenGL, and no cap bit.
Pretty similar to a 1d texture, though allows specifying a start element.
The util code for handling clears also needs adjustments (this also fixes
a bug there that caused crashes when handling pure integer formats).
---
 src/gallium/auxiliary/util/u_surface.c  |   55 +++
 src/gallium/drivers/llvmpipe/lp_rast.c  |   25 ++--
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |4 +-
 src/gallium/drivers/llvmpipe/lp_scene.c |   35 +++--
 src/gallium/drivers/llvmpipe/lp_texture.c   |   44 +++--
 5 files changed, 108 insertions(+), 55 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_surface.c 
b/src/gallium/auxiliary/util/u_surface.c
index b948b46..fba0798 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -323,20 +323,59 @@ util_clear_render_target(struct pipe_context *pipe,
if (!dst-texture)
   return;
/* XXX: should handle multiple layers */
-   dst_map = pipe_transfer_map(pipe,
-   dst-texture,
-   dst-u.tex.level,
-   dst-u.tex.first_layer,
-   PIPE_TRANSFER_WRITE,
-   dstx, dsty, width, height, dst_trans);
+
+   if (dst-texture-target == PIPE_BUFFER) {
+  /*
+   * The fill naturally works on the surface format, however
+   * the transfer uses resource format which is just bytes for buffers.
+   */
+  unsigned dx, w;
+  unsigned pixstride = util_format_get_blocksize(dst-format);
+  dx = dstx * pixstride;
+  w = width * pixstride;
+  dst_map = pipe_transfer_map(pipe,
+  dst-texture,
+  0, 0,
+  PIPE_TRANSFER_WRITE,
+  dx, 0, w, 1,
+  dst_trans);
+  dst_map = (uint8_t *)dst_map + dst-u.buf.first_element * pixstride;
+   }
+   else {
+  /* XXX: should handle multiple layers */
+  dst_map = pipe_transfer_map(pipe,
+  dst-texture,
+  dst-u.tex.level,
+  dst-u.tex.first_layer,
+  PIPE_TRANSFER_WRITE,
+  dstx, dsty, width, height, dst_trans);
+
+   }
 
assert(dst_map);
 
if (dst_map) {
+  enum pipe_format format = dst-format;
   assert(dst_trans-stride  0);
 
-  util_pack_color(color-f, dst-texture-format, uc);
-  util_fill_rect(dst_map, dst-texture-format,
+  if (util_format_is_pure_integer(format)) {
+ /*
+  * We expect int/uint clear values here, though some APIs
+  * might disagree (but in any case util_pack_color()
+  * couldn't handle it)...
+  */
+ if (util_format_is_pure_sint(format)) {
+util_format_write_4i(format, color-i, 0, uc, 0, 0, 0, 1, 1);
+ }
+ else {
+assert(util_format_is_pure_uint(format));
+util_format_write_4ui(format, color-ui, 0, uc, 0, 0, 0, 1, 1);
+ }
+  }
+  else {
+ util_pack_color(color-f, dst-format, uc);
+  }
+  util_fill_rect(dst_map, dst-format,
  dst_trans-stride,
  0, 0, width, height, uc);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c 
b/src/gallium/drivers/llvmpipe/lp_rast.c
index b5e5da6..6183f41 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -165,32 +165,13 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
 
  for (i = 0; i  scene-fb.nr_cbufs; i++) {
 enum pipe_format format = scene-fb.cbufs[i]-format;
-/*
- * XXX the format_write_4i/ui functions do clamping to max value
- * and I'm not sure that's actually right - spec doesn't seem to
- * say much about that topic. If it is should probably adjust the
- * border color handling to do the same. If not and chopping off
- * bits is the way to go, the write_4i and write_4ui functions
- * would be identical.
- */
-if (util_format_is_pure_sint(format)) {
-   int rgba[4];
-   rgba[0] = arg.clear_color.i[0];
-   rgba[1] = arg.clear_color.i[1];
-   rgba[2] = arg.clear_color.i[2];
-   rgba[3] = arg.clear_color.i[3];
 
-   util_format_write_4i(format, rgba, 0, uc, 0, 0, 0, 1, 1);
+if (util_format_is_pure_sint(format)) {
+   util_format_write_4i(format, arg.clear_color.i, 0, uc, 0, 0, 
0, 1, 1);
 }
 else {
-   unsigned rgba[4];
-   rgba[0] = arg.clear_color.ui[0];
-   rgba[1] 

[Mesa-dev] [PATCH] llvmpipe: support GL_ARB_texture_buffer_object/GL_ARB_texture_buffer_range

2013-02-22 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This also fixes not honoring first/last_layer view parameters for array
textures, plus not honoring last_level view parameter for all textures
(neither is really used by OpenGL).
This mostly passes piglit arb_texture_buffer_object tests (it needs, however,
glsl 140 version override, plus GL 3.1 override, the latter only because
mesa does not allow ARB_tbo in non-core contexts).
Most arb_texture_buffer_object tests pass, with the exception of
arb_texture_buffer_object-formats. With arb parameter it passes most weirdo
formats before it segfaults in the state tracker, this looks to be some issue
with using legacy formats in core context (fails the same in softpipe).
With core parameter it passes with fs, however fails with vs (for most
formats). This will be fixed later (debugging shows we're completely missing
the shader recompile depending on format).
---
 src/gallium/auxiliary/draw/draw_context.c |4 +-
 src/gallium/auxiliary/draw/draw_context.h |2 +-
 src/gallium/auxiliary/draw/draw_llvm.c|6 +-
 src/gallium/auxiliary/draw/draw_llvm.h|2 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |4 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |8 +-
 src/gallium/drivers/llvmpipe/lp_jit.h |4 +-
 src/gallium/drivers/llvmpipe/lp_screen.c  |4 +-
 src/gallium/drivers/llvmpipe/lp_setup.c   |  134 ++---
 src/gallium/drivers/llvmpipe/lp_state_sampler.c   |   87 -
 src/gallium/drivers/llvmpipe/lp_texture.c |   22 
 src/gallium/drivers/llvmpipe/lp_texture.h |   21 
 12 files changed, 190 insertions(+), 108 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index dfa8927..c2b6851 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -799,7 +799,7 @@ draw_set_samplers(struct draw_context *draw,
 void
 draw_set_mapped_texture(struct draw_context *draw,
 unsigned shader_stage,
-unsigned sampler_idx,
+unsigned sv_idx,
 uint32_t width, uint32_t height, uint32_t depth,
 uint32_t first_level, uint32_t last_level,
 const void *base_ptr,
@@ -811,7 +811,7 @@ draw_set_mapped_texture(struct draw_context *draw,
 #ifdef HAVE_LLVM
   if (draw-llvm)
  draw_llvm_set_mapped_texture(draw,
-  sampler_idx,
+  sv_idx,
   width, height, depth, first_level,
   last_level, base_ptr,
   row_stride, img_stride, mip_offsets);
diff --git a/src/gallium/auxiliary/draw/draw_context.h 
b/src/gallium/auxiliary/draw/draw_context.h
index a4937b6..5928e75 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -151,7 +151,7 @@ draw_set_samplers(struct draw_context *draw,
 void
 draw_set_mapped_texture(struct draw_context *draw,
 unsigned shader_stage,
-unsigned sampler_idx,
+unsigned sv_idx,
 uint32_t width, uint32_t height, uint32_t depth,
 uint32_t first_level, uint32_t last_level,
 const void *base,
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 2467e5a..993f03c 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1497,7 +1497,7 @@ draw_llvm_dump_variant_key(struct draw_llvm_variant_key 
*key)
 
 void
 draw_llvm_set_mapped_texture(struct draw_context *draw,
- unsigned sampler_idx,
+ unsigned sv_idx,
  uint32_t width, uint32_t height, uint32_t depth,
  uint32_t first_level, uint32_t last_level,
  const void *base_ptr,
@@ -1508,9 +1508,9 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
unsigned j;
struct draw_jit_texture *jit_tex;
 
-   assert(sampler_idx  Elements(draw-llvm-jit_context.textures));
+   assert(sv_idx  Elements(draw-llvm-jit_context.textures));
 
-   jit_tex = draw-llvm-jit_context.textures[sampler_idx];
+   jit_tex = draw-llvm-jit_context.textures[sv_idx];
 
jit_tex-width = width;
jit_tex-height = height;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h 
b/src/gallium/auxiliary/draw/draw_llvm.h
index 17ca304..8f99f27 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -333,7 +333,7 @@ draw_llvm_set_sampler_state(struct draw_context *draw);
 
 void
 draw_llvm_set_mapped_texture(struct draw_context 

[Mesa-dev] [PATCH] draw: make sure pipeline is revalidated when sampler views or samplers change.

2013-02-22 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Since with llvm execution parts of sampler view and sampler state are baked into
the shader, we need to revalidate, otherwise the wrong shader might get used.
(Not completely sure but I think this would not be required for non-llvm case,
along with everything else in these functions.)
This caused bugs in piglit arb_texture_buffer_object-formats, because we never
noticed that the view format changed.
---
 src/gallium/auxiliary/draw/draw_context.c |4 
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index c2b6851..528543b 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -764,6 +764,8 @@ draw_set_sampler_views(struct draw_context *draw,
debug_assert(shader_stage  PIPE_SHADER_TYPES);
debug_assert(num = PIPE_MAX_SHADER_SAMPLER_VIEWS);
 
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
for (i = 0; i  num; ++i)
   draw-sampler_views[shader_stage][i] = views[i];
for (i = num; i  PIPE_MAX_SHADER_SAMPLER_VIEWS; ++i)
@@ -783,6 +785,8 @@ draw_set_samplers(struct draw_context *draw,
debug_assert(shader_stage  PIPE_SHADER_TYPES);
debug_assert(num = PIPE_MAX_SAMPLERS);
 
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
for (i = 0; i  num; ++i)
   draw-samplers[shader_stage][i] = samplers[i];
for (i = num; i  PIPE_MAX_SAMPLERS; ++i)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 5322397..ef0631c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -117,7 +117,8 @@ lp_sampler_static_texture_state(struct 
lp_static_texture_state *state,
state-level_zero_only   = !view-u.tex.last_level;
 
/*
-* FIXME: Handle the remainder of pipe_sampler_view.
+* the layer / element / level parameters are all either dynamic
+* state or handled transparently wrt execution.
 */
 }
 
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] gallium/docs: improve text about resources a bit.

2013-02-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This clarifies some things and gets rid of some old stuff.
The most significant one is probably that buffers cannot have formats
(nearly all drivers completely ignored format and used width0 as byte size
already in any case). There seems to be no use case for structured buffers.
(Note while d3d11 has new Structured Buffers, these still aren't associated
with a format, rather a byte stride, which we can't do yet either way.)
---
 src/gallium/docs/source/resources.rst |   62 ++---
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/src/gallium/docs/source/resources.rst 
b/src/gallium/docs/source/resources.rst
index c8a5766..553e335 100644
--- a/src/gallium/docs/source/resources.rst
+++ b/src/gallium/docs/source/resources.rst
@@ -34,44 +34,52 @@ will probably be advertised with an appropriate cap.
 TODO: document all targets. Note that both 3D and cube have restrictions
 that depend on the hardware generation.
 
-TODO: can buffers have a non-R8 format?
 
 PIPE_BUFFER
 ^^^
 
-Buffer resource: can be used as a vertex, index, constant buffer (appropriate 
bind flags must be requested).
+Buffer resource: can be used as a vertex, index, constant buffer
+(appropriate bind flags must be requested).
+
+Buffers do not really have a format, it's just bytes, but they are required
+to have their type set to a R8 format (without a specific just byte format,
+R8_UINT would probably make the most sense, but for historic reasons R8_UNORM
+is ok too). (This is just to make some shared buffer/texture code easier so
+format size can be queried.)
+width0 serves as size, most other resource properties don't apply but must be
+set appropriately (depth0/height0/array_size must be 1, last_level 0).
 
 They can be bound to stream output if supported.
 TODO: what about the restrictions lifted by the several later GL transform 
feedback extensions? How does one advertise that in Gallium?
 
-They can be also be bound to a shader stage as usual.
-TODO: are all drivers supposed to support this? how does this work exactly? 
are there size limits?
-
-They can be also be bound to the framebuffer as usual.
-TODO: are all drivers supposed to support this? how does this work exactly? 
are there size limits?
+They can be also be bound to a shader stage (for sampling) as usual by
+creating an appropriate sampler view, if the driver supports 
PIPE_CAP_TEXTURE_BUFFER_OBJECTS.
+This supports larger width than a 1d texture would
+(TODO limit currently unspecified, minimum must be at least 65536).
+Only the direct fetch sample opcodes are supported (TGSI_OPCODE_TXF,
+TGSI_OPCODE_SAMPLE_I) so the sampler state (coord wrapping etc.)
+is mostly ignored (with SAMPLE_I there's no sampler state at all).
+
+They can be also be bound to the framebuffer (only as color render target, not
+depth buffer, also there cannot be a depth buffer bound at the same time) as 
usual
+by creating an appropriate view (this is not usable in OpenGL).
+TODO there's no CAP bit currently for this, there's also unspecified size etc. 
limits
 TODO: is there any chance of supporting GL pixel buffer object acceleration 
with this?
 
-- depth0 must be 1
-- last_level must be 0
-- TODO: what about normalization?
-- TODO: wrap modes/other sampling state?
-- TODO: are arbitrary formats supported? in which cases?
 
 OpenGL: vertex buffers in GL 1.5 or GL_ARB_vertex_buffer_object
 
 - Binding to stream out requires GL 3.0 or GL_NV_transform_feedback
 - Binding as constant buffers requires GL 3.1 or GL_ARB_uniform_buffer_object
 - Binding to a sampling stage requires GL 3.1 or GL_ARB_texture_buffer_object
-- TODO: can they be bound to an FBO?
 
 D3D11: buffer resources
 - Binding to a render target requires D3D_FEATURE_LEVEL_10_0
 
-PIPE_TEXTURE_1D
-^^^
+PIPE_TEXTURE_1D / PIPE_TEXTURE_1D_ARRAY
+^^^
 1D surface accessed with normalized coordinates.
-
-UNIMPLEMENTED: 1D texture arrays not supported
+1D array textures are supported depending on PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS.
 
 - If PIPE_CAP_NPOT_TEXTURES is not supported,
   width must be a power of two
@@ -101,11 +109,10 @@ OpenCL: can create OpenCL images based on this, that can 
then be sampled arbitra
 
 D3D11: not supported (only PIPE_TEXTURE_2D with normalized coordinates is 
supported)
 
-PIPE_TEXTURE_2D
-^^^
+PIPE_TEXTURE_2D / PIPE_TEXTURE_2D_ARRAY
+^^^
 2D surface accessed with normalized coordinates.
-
-UNIMPLEMENTED: 2D texture arrays not supported
+2D array textures are supported depending on PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS.
 
 - If PIPE_CAP_NPOT_TEXTURES is not supported,
   width and height must be powers of two
@@ -142,18 +149,16 @@ D3D11: 3D textures
 
 - PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_10_0
 
-PIPE_TEXTURE_CUBE
-^
+PIPE_TEXTURE_CUBE / PIPE_TEXTURE_CUBE_ARRAY

[Mesa-dev] [PATCH 2/2] llvmpipe: simplify buffer allocation logic.

2013-02-21 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Now with the buffer format clarification we don't need all that logic any longer.
(Note that it never would have worked in any case: because blockwidth and
blockheight were swapped, any allocation with a multi-byte format would have
had zero size.)
---
 src/gallium/drivers/llvmpipe/lp_texture.c |   12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 1c4f1dc..d55985b 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -311,13 +311,11 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
}
else {
   /* other data (vertex buffer, const buffer, etc) */
-  const enum pipe_format format = templat-format;
-  const uint w = templat-width0 / util_format_get_blockheight(format);
-  /* XXX buffers should only have one dimension, those values should be 1 
*/
-  const uint h = templat-height0 / util_format_get_blockwidth(format);
-  const uint d = templat-depth0;
-  const uint bpp = util_format_get_blocksize(format);
-  const uint bytes = w * h * d * bpp;
+  const uint bytes = templat-width0;
+  assert(util_format_get_blocksize(templat-format) == 1);
+  assert(templat-height0 == 1);
+  assert(templat-depth0 == 1);
+  assert(templat-last_level == 0);
   lpr-data = align_malloc(bytes, 16);
   if (!lpr-data)
  goto fail;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: make sure key size is calculated consistently.

2013-02-19 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Some parts calculated key size by using shader information, others by using
the pipe_vertex_element information. Since it is perfectly valid to have more
vertex_elements set than the vertex shader is using those may not be the same,
so we weren't copying over all vertex_element state - this caused the tgsi dump
to assert (iterates over all vertex elements). More importantly in this
situation it would also break vertex texturing completely (since the sampler
state derived from the key is at a different position than expected).
Fix this by deriving key->nr_vertex_elements from the shader information
instead of the pipe_vertex_element state (unlike dx10, we can't have holes
in pipe_vertex_element state, so this should be safe).
(Note that actual llvm shader generation does not use the pipe_vertex_element
state from the key itself in any case (although I guess it could) but uses
the one from draw.pt (which should be the same though contains all elements)
instead.)
---
 src/gallium/auxiliary/draw/draw_llvm.c |   14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index f3b..2467e5a 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -420,8 +420,8 @@ draw_llvm_destroy(struct draw_llvm *llvm)
  */
 struct draw_llvm_variant *
 draw_llvm_create_variant(struct draw_llvm *llvm,
-unsigned num_inputs,
-const struct draw_llvm_variant_key *key)
+ unsigned num_inputs,
+ const struct draw_llvm_variant_key *key)
 {
struct draw_llvm_variant *variant;
struct llvm_vertex_shader *shader =
@@ -429,8 +429,8 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
LLVMTypeRef vertex_header;
 
variant = MALLOC(sizeof *variant +
-   shader-variant_key_size -
-   sizeof variant-key);
+shader-variant_key_size -
+sizeof variant-key);
if (variant == NULL)
   return NULL;
 
@@ -1415,8 +1415,12 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char 
*store)
 
/* Presumably all variants of the shader should have the same
 * number of vertex elements - ie the number of shader inputs.
+* NOTE: we NEED to store the needed number of needed inputs
+* here, not the number of provided elements to match keysize
+* (and the offset of sampler state in the key).
 */
-   key-nr_vertex_elements = llvm-draw-pt.nr_vertex_elements;
+   key-nr_vertex_elements = 
llvm-draw-vs.vertex_shader-info.file_max[TGSI_FILE_INPUT] + 1;
+   assert(key-nr_vertex_elements = llvm-draw-pt.nr_vertex_elements);
 
/* will have to rig this up properly later */
key-clip_xy = llvm-draw-clip_xy;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: lp_resource_copy cleanup

2013-02-19 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We don't need to flush resources for each layer, and since we don't actually
care about layer at all in the flush function just drop the parameter.
Also we can use util_copy_box instead of repeated util_copy_rect.
---
 src/gallium/drivers/llvmpipe/lp_flush.c   |3 +-
 src/gallium/drivers/llvmpipe/lp_flush.h   |1 -
 src/gallium/drivers/llvmpipe/lp_surface.c |   87 +++--
 src/gallium/drivers/llvmpipe/lp_texture.c |3 +-
 src/gallium/drivers/llvmpipe/lp_texture.h |2 +-
 5 files changed, 47 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c 
b/src/gallium/drivers/llvmpipe/lp_flush.c
index 964b792..cbfe564 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -98,7 +98,6 @@ boolean
 llvmpipe_flush_resource(struct pipe_context *pipe,
 struct pipe_resource *resource,
 unsigned level,
-int layer,
 boolean read_only,
 boolean cpu_access,
 boolean do_not_block,
@@ -106,7 +105,7 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
 {
unsigned referenced;
 
-   referenced = llvmpipe_is_resource_referenced(pipe, resource, level, layer);
+   referenced = llvmpipe_is_resource_referenced(pipe, resource, level);
 
if ((referenced  LP_REFERENCED_FOR_WRITE) ||
((referenced  LP_REFERENCED_FOR_READ)  !read_only)) {
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h 
b/src/gallium/drivers/llvmpipe/lp_flush.h
index efff94c..bc1e2a8 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.h
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -47,7 +47,6 @@ boolean
 llvmpipe_flush_resource(struct pipe_context *pipe,
 struct pipe_resource *resource,
 unsigned level,
-int layer,
 boolean read_only,
 boolean cpu_access,
 boolean do_not_block,
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c 
b/src/gallium/drivers/llvmpipe/lp_surface.c
index dbaed95..a83a903 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -57,14 +57,12 @@ lp_resource_copy(struct pipe_context *pipe,
  struct pipe_resource *src, unsigned src_level,
  const struct pipe_box *src_box)
 {
-   /* XXX this used to ignore srcz/dstz
-* assume it works the same for cube and 3d
-*/
struct llvmpipe_resource *src_tex = llvmpipe_resource(src);
struct llvmpipe_resource *dst_tex = llvmpipe_resource(dst);
const enum pipe_format format = src_tex-base.format;
unsigned width = src_box-width;
unsigned height = src_box-height;
+   unsigned depth = src_box-depth;
unsigned z;
 
/* Fallback for buffers. */
@@ -74,27 +72,28 @@ lp_resource_copy(struct pipe_context *pipe,
   return;
}
 
+   llvmpipe_flush_resource(pipe,
+   dst, dst_level,
+   FALSE, /* read_only */
+   TRUE, /* cpu_access */
+   FALSE, /* do_not_block */
+   blit dest);
+
+   llvmpipe_flush_resource(pipe,
+   src, src_level,
+   TRUE, /* read_only */
+   TRUE, /* cpu_access */
+   FALSE, /* do_not_block */
+   blit src);
+
+   /*
+   printf(surface copy from %u lvl %u to %u lvl %u: %u,%u,%u to %u,%u,%u %u x 
%u x %u\n,
+  src_tex-id, src_level, dst_tex-id, dst_level,
+  src_box-x, src_box-y, src_box-z, dstx, dsty, dstz,
+  src_box-width, src_box-height, src_box-depth);
+   */
+
for (z = 0; z  src_box-depth; z++){
-  llvmpipe_flush_resource(pipe,
-  dst, dst_level, dstz + z,
-  FALSE, /* read_only */
-  TRUE, /* cpu_access */
-  FALSE, /* do_not_block */
-  blit dest);
-
-  llvmpipe_flush_resource(pipe,
-  src, src_level, src_box-z + z,
-  TRUE, /* read_only */
-  TRUE, /* cpu_access */
-  FALSE, /* do_not_block */
-  blit src);
-
-  /*
-  printf(surface copy from %u lvl %u to %u lvl %u: %u,%u,%u to %u,%u,%u 
%u x %u x %u\n,
- src_tex-id, src_level, dst_tex-id, dst_level,
- src_box-x, src_box-y, src_box-z, dstx, dsty, dstz,
- src_box-width, src_box-height, src_box-depth);
-  */
 
   /* set src tiles to linear layout */
   {
@@ -148,27 +147,29 @@ lp_resource_copy(struct pipe_context *pipe,
 }
  }
   }
+   }
 
- 

[Mesa-dev] [PATCH] gallivm: fix indirect src register fetches requiring bitcast

2013-02-19 Thread sroland
From: Roland Scheidegger srol...@vmware.com

For constant and temporary register fetches, the bitcasts weren't done
correctly for the indirect case, leading to crashes due to type mismatches.
Simply do the bitcasts after fetching (much simpler than fixing up the load
pointer for the various cases).

This fixes https://bugs.freedesktop.org/show_bug.cgi?id=61036
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |   37 ++-
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index ae4a577..69957fe 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -603,10 +603,10 @@ emit_fetch_constant(
LLVMBuilderRef builder = gallivm-builder;
struct lp_build_context *uint_bld = bld_base-uint_bld;
LLVMValueRef indirect_index = NULL;
-   struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
unsigned dimension = 0;
LLVMValueRef dimension_index;
LLVMValueRef consts_ptr;
+   LLVMValueRef res;
 
/* XXX: Handle fetching xyzw components as a vector */
assert(swizzle != ~0);
@@ -637,7 +637,7 @@ emit_fetch_constant(
   index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
 
   /* Gather values from the constant buffer */
-  return build_gather(bld_fetch, consts_ptr, index_vec);
+  res = build_gather(bld_base-base, consts_ptr, index_vec);
}
else {
   LLVMValueRef index;  /* index into the const buffer */
@@ -646,18 +646,16 @@ emit_fetch_constant(
   index = lp_build_const_int32(gallivm, reg-Register.Index*4 + swizzle);
 
   scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
-   index, 1, );
-
-  if (stype != TGSI_TYPE_FLOAT  stype != TGSI_TYPE_UNTYPED) {
- LLVMTypeRef ivtype = 
LLVMPointerType(LLVMInt32TypeInContext(gallivm-context), 0);
- LLVMValueRef temp_ptr;
- temp_ptr = LLVMBuildBitCast(builder, scalar_ptr, ivtype, );
- scalar = LLVMBuildLoad(builder, temp_ptr, );
-  } else
- scalar = LLVMBuildLoad(builder, scalar_ptr, );
+index, 1, );
+  scalar = LLVMBuildLoad(builder, scalar_ptr, );
+  res = lp_build_broadcast_scalar(bld_base-base, scalar);
+   }
 
-  return lp_build_broadcast_scalar(bld_fetch, scalar);
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+  struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+  res = LLVMBuildBitCast(builder, res, bld_fetch-vec_type, );
}
+   return res;
 }
 
 static LLVMValueRef
@@ -791,16 +789,13 @@ emit_fetch_temporary(
}
else {
   LLVMValueRef temp_ptr;
-  if (stype != TGSI_TYPE_FLOAT  stype != TGSI_TYPE_UNTYPED) {
- LLVMTypeRef itype = LLVMPointerType(bld-bld_base.int_bld.vec_type, 
0);
- LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index,
- swizzle);
- temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, );
-  } else
- temp_ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, swizzle);
+  temp_ptr = lp_get_temp_ptr_soa(bld, reg-Register.Index, swizzle);
   res = LLVMBuildLoad(builder, temp_ptr, );
-  if (!res)
- return bld-bld_base.base.undef;
+   }
+
+   if (stype == TGSI_TYPE_SIGNED || stype == TGSI_TYPE_UNSIGNED) {
+  struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
+  res = LLVMBuildBitCast(builder, res, bld_fetch-vec_type, );
}
 
return res;
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: make sure key size is calculated consistently.

2013-02-18 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Some parts calculated key size by using shader information, others by using
the pipe_vertex_element information. Since it is perfectly valid to have more
vertex_elements set than the vertex shader is using those may not be the same,
so we weren't copying over all vertex_element state - this caused the tgsi dump
to assert (iterates over all vertex elements). With some luck it didn't
crash otherwise even though the llvm generate_fetch code also iterates over
all vertex elements (probably because llvm threw away the unused inputs anyway),
but if in this situation vertex texturing would be used things would definitely
go wrong (as the sampler information wouldn't be copied).
So drop the key size calculation using shader information.
---
 src/gallium/auxiliary/draw/draw_llvm.c |   13 -
 src/gallium/auxiliary/draw/draw_llvm.h |1 -
 .../draw/draw_pt_fetch_shade_pipeline_llvm.c   |7 ++-
 src/gallium/auxiliary/draw/draw_vs_llvm.c  |6 --
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index f3b..df57358 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -420,17 +420,20 @@ draw_llvm_destroy(struct draw_llvm *llvm)
  */
 struct draw_llvm_variant *
 draw_llvm_create_variant(struct draw_llvm *llvm,
-unsigned num_inputs,
-const struct draw_llvm_variant_key *key)
+ unsigned num_inputs,
+ const struct draw_llvm_variant_key *key)
 {
struct draw_llvm_variant *variant;
struct llvm_vertex_shader *shader =
   llvm_vertex_shader(llvm-draw-vs.vertex_shader);
LLVMTypeRef vertex_header;
+   unsigned key_size = draw_llvm_variant_key_size(key-nr_vertex_elements,
+  MAX2(key-nr_samplers,
+   key-nr_sampler_views));
 
variant = MALLOC(sizeof *variant +
-   shader-variant_key_size -
-   sizeof variant-key);
+key_size -
+sizeof variant-key);
if (variant == NULL)
   return NULL;
 
@@ -440,7 +443,7 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
 
create_jit_types(variant);
 
-   memcpy(variant-key, key, shader-variant_key_size);
+   memcpy(variant-key, key, key_size);
 
vertex_header = create_jit_vertex_header(variant-gallivm, num_inputs);
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h 
b/src/gallium/auxiliary/draw/draw_llvm.h
index 17ca304..b20cee5 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -281,7 +281,6 @@ struct draw_llvm_variant
 struct llvm_vertex_shader {
struct draw_vertex_shader base;
 
-   unsigned variant_key_size;
struct draw_llvm_variant_list_item variants;
unsigned variants_created;
unsigned variants_cached;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c 
b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index b0c18ed..d7f855f 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -127,13 +127,18 @@ llvm_middle_end_prepare( struct draw_pt_middle_end 
*middle,
   struct llvm_vertex_shader *shader = llvm_vertex_shader(vs);
   char store[DRAW_LLVM_MAX_VARIANT_KEY_SIZE];
   unsigned i;
+  unsigned key_size;
 
   key = draw_llvm_make_variant_key(fpme-llvm, store);
 
+  key_size = draw_llvm_variant_key_size(key-nr_vertex_elements,
+MAX2(key-nr_samplers,
+ key-nr_sampler_views));
+
   /* Search shader's list of variants for the key */
   li = first_elem(shader-variants);
   while (!at_end(shader-variants, li)) {
- if (memcmp(li-base-key, key, shader-variant_key_size) == 0) {
+ if (memcmp(li-base-key, key, key_size) == 0) {
 variant = li-base;
 break;
  }
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c 
b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index ac3999e..50cef79 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -98,12 +98,6 @@ draw_create_vs_llvm(struct draw_context *draw,
 
tgsi_scan_shader(state-tokens, vs-base.info);
 
-   vs-variant_key_size = 
-  draw_llvm_variant_key_size(
- vs-base.info.file_max[TGSI_FILE_INPUT]+1,
- MAX2(vs-base.info.file_max[TGSI_FILE_SAMPLER]+1,
-  vs-base.info.file_max[TGSI_FILE_SAMPLER_VIEW]+1));
-
vs-base.state.stream_output = state-stream_output;
vs-base.draw = draw;
vs-base.prepare = vs_llvm_prepare;
-- 
1.7.9.5


[Mesa-dev] [PATCH] gallivm/tgsi: fix src modifier fetching with non-float types.

2013-02-15 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Need to take the type into account. Also, if we want to allow
mov's with modifiers we need to pick a type (assume float).

v2: don't allow all modifiers on all types; in particular, don't allow
absolute on non-float types and don't allow negate on unsigned.
Also treat UADD as signed (despite the name) since it is used
for handling both signed and unsigned integer arguments and otherwise
modifiers don't work.
Also add tgsi docs clarifying this.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c |   36 +--
 src/gallium/auxiliary/tgsi/tgsi_exec.c  |2 +-
 src/gallium/auxiliary/tgsi/tgsi_info.c  |6 +++--
 src/gallium/docs/source/tgsi.rst|   15 +++
 4 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index a4fea7d..b97c766 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -311,11 +311,43 @@ lp_build_emit_fetch(
}
 
if (reg-Register.Absolute) {
-  res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res);
+  switch (stype) {
+  case TGSI_TYPE_FLOAT:
+  case TGSI_TYPE_DOUBLE:
+  case TGSI_TYPE_UNTYPED:
+  /* modifiers on movs assume data is float */
+ res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res);
+ break;
+  case TGSI_TYPE_UNSIGNED:
+  case TGSI_TYPE_SIGNED:
+  case TGSI_TYPE_VOID:
+  default:
+ /* abs modifier is only legal on floating point types */
+ assert(0);
+ break;
+  }
}
 
if (reg-Register.Negate) {
-  res = lp_build_negate( bld_base-base, res );
+  switch (stype) {
+  case TGSI_TYPE_FLOAT:
+  case TGSI_TYPE_UNTYPED:
+ /* modifiers on movs assume data is float */
+ res = lp_build_negate( bld_base-base, res );
+ break;
+  case TGSI_TYPE_DOUBLE:
+ /* no double build context */
+ assert(0);
+ break;
+  case TGSI_TYPE_SIGNED:
+ res = lp_build_negate( bld_base-int_bld, res );
+ break;
+  case TGSI_TYPE_UNSIGNED:
+  case TGSI_TYPE_VOID:
+  default:
+ assert(0);
+ break;
+  }
}
 
/*
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 03f1942..1099d06 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -4150,7 +4150,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_UADD:
-  exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, 
TGSI_EXEC_DATA_UINT);
+  exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, 
TGSI_EXEC_DATA_INT);
   break;
 
case TGSI_OPCODE_UDIV:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c 
b/src/gallium/auxiliary/tgsi/tgsi_info.c
index f289ebc..9c6fdfc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -277,9 +277,9 @@ tgsi_opcode_infer_src_type( uint opcode )
case TGSI_OPCODE_AND:
case TGSI_OPCODE_OR:
case TGSI_OPCODE_XOR:
+   /* XXX some src args may be signed for SAD ? */
case TGSI_OPCODE_SAD:
case TGSI_OPCODE_U2F:
-   case TGSI_OPCODE_UADD:
case TGSI_OPCODE_UDIV:
case TGSI_OPCODE_UMOD:
case TGSI_OPCODE_UMAD:
@@ -310,6 +310,8 @@ tgsi_opcode_infer_src_type( uint opcode )
case TGSI_OPCODE_IABS:
case TGSI_OPCODE_ISSG:
case TGSI_OPCODE_UARL:
+   /* UADD is both signed and unsigned require signed for working modifiers */
+   case TGSI_OPCODE_UADD:
   return TGSI_TYPE_SIGNED;
default:
   return TGSI_TYPE_FLOAT;
@@ -331,7 +333,6 @@ tgsi_opcode_infer_dst_type( uint opcode )
case TGSI_OPCODE_OR:
case TGSI_OPCODE_XOR:
case TGSI_OPCODE_SAD:
-   case TGSI_OPCODE_UADD:
case TGSI_OPCODE_UDIV:
case TGSI_OPCODE_UMOD:
case TGSI_OPCODE_UMAD:
@@ -362,6 +363,7 @@ tgsi_opcode_infer_dst_type( uint opcode )
case TGSI_OPCODE_ARR:
case TGSI_OPCODE_IABS:
case TGSI_OPCODE_ISSG:
+   case TGSI_OPCODE_UADD:
   return TGSI_TYPE_SIGNED;
default:
   return TGSI_TYPE_FLOAT;
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index dd4c773..d9a7fe9 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -23,6 +23,21 @@ When an instruction has a scalar result, the result is 
usually copied into
 each of the components of *dst*. When this happens, the result is said to be
 *replicated* to *dst*. :opcode:`RCP` is one such instruction.
 
+Modifiers
+^^^^^^^^^
+
+TGSI supports modifiers on inputs (as well as saturate modifier on 
instructions).
+
+For inputs which have a floating point type, both absolute value and negation
+modifiers are supported (with absolute value being applied first).
+TGSI_OPCODE_MOV is considered to have float input type for applying 

[Mesa-dev] [PATCH 1/3] gallivm: DIV shouldn't be deprecated.

2013-02-14 Thread sroland
From: Roland Scheidegger srol...@vmware.com

(Though it looks like glsl won't emit it.)
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c |1 -
 1 file changed, 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index a4fea7d..53c81bd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -211,7 +211,6 @@ lp_build_tgsi_inst_llvm(
case TGSI_OPCODE_X2D:
case TGSI_OPCODE_ARA:
case TGSI_OPCODE_BRA:
-   case TGSI_OPCODE_DIV:
case TGSI_OPCODE_PUSHA:
case TGSI_OPCODE_POPA:
case TGSI_OPCODE_SAD:
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] gallivm: fix src modifier fetching with non-float types.

2013-02-14 Thread sroland
From: Roland Scheidegger srol...@vmware.com

Need to take the type into account. Also, if we want to allow
mov's with modifiers we need to pick a type (assume float).
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c |   54 ++-
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index 53c81bd..00a493a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -310,11 +310,61 @@ lp_build_emit_fetch(
}
 
if (reg-Register.Absolute) {
-  res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res);
+  switch (stype) {
+  case TGSI_TYPE_FLOAT:
+  case TGSI_TYPE_DOUBLE:
+  case TGSI_TYPE_UNTYPED:
+ /*
+  * modifiers on movs assume data is float
+  */
+ res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_ABS, res);
+ break;
+  case TGSI_TYPE_UNSIGNED:
+  case TGSI_TYPE_SIGNED:
+ /*
+  * XXX note we cannot effectively distinguish between signed and 
unsigned,
+  * since some opcodes (like uadd) are used for both signed and 
unsigned
+  * source operands. Hence this always assumes signed numbers.
+  * (May revisit this by using signed type for such opcodes?)
+  */
+ res = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_IABS, res);
+ break;
+  case TGSI_TYPE_VOID:
+  default:
+ /* dunno how that should work if legal just ignore? */
+ assert(0);
+ break;
+  }
}
 
if (reg-Register.Negate) {
-  res = lp_build_negate( bld_base-base, res );
+  switch (stype) {
+  case TGSI_TYPE_FLOAT:
+  case TGSI_TYPE_UNTYPED:
+ /*
+  * modifiers on movs assume data is float
+  */
+ res = lp_build_negate( bld_base-base, res );
+ break;
+  case TGSI_TYPE_DOUBLE:
+ /* no double build context */
+ assert(0);
+ break;
+  case TGSI_TYPE_UNSIGNED:
+  case TGSI_TYPE_SIGNED:
+ /*
+  * like above, cannot distinguish signed and unsigned.
+  * However, in any case it looks like we probably should return
+  * two's complement in any case.
+  */
+ res = lp_build_negate( bld_base-int_bld, res );
+ break;
+  case TGSI_TYPE_VOID:
+  default:
+ /* dunno how that should work if legal just ignore? */
+ assert(0);
+ break;
+  }
}
 
/*
-- 
1.7.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] gallivm/tgsi: fix issues with sample opcodes

2013-02-14 Thread sroland
From: Roland Scheidegger srol...@vmware.com

We need to encode them as Texture instructions since the NumOffsets field
is encoded there. However, we don't encode the actual target in there; this
is derived from the sampler view src later.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h  |2 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c |   60 +++
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c  |  110 
 src/gallium/auxiliary/tgsi/tgsi_info.c   |2 +-
 src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h |   38 +--
 src/gallium/auxiliary/tgsi/tgsi_text.c   |   11 ++
 src/gallium/auxiliary/tgsi/tgsi_ureg.h   |  119 ++
 7 files changed, 247 insertions(+), 95 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 3446a8f..dac97c3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -374,6 +374,8 @@ struct lp_build_tgsi_soa_context
 
const struct lp_build_sampler_soa *sampler;
 
+   struct tgsi_declaration_sampler_view sv[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+
LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][TGSI_NUM_CHANNELS];
LLVMValueRef temps[LP_MAX_TGSI_TEMPS][TGSI_NUM_CHANNELS];
LLVMValueRef addr[LP_MAX_TGSI_ADDRS][TGSI_NUM_CHANNELS];
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index 3a19fe2..a4caf78 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -188,7 +188,8 @@ analyse_tex(struct analysis_context *ctx,
 static void
 analyse_sample(struct analysis_context *ctx,
const struct tgsi_full_instruction *inst,
-   enum lp_build_tex_modifier modifier)
+   enum lp_build_tex_modifier modifier,
+   boolean shadow)
 {
struct lp_tgsi_info *info = ctx-info;
unsigned chan;
@@ -197,45 +198,14 @@ analyse_sample(struct analysis_context *ctx,
   struct lp_tgsi_texture_info *tex_info = info-tex[info-num_texs];
   boolean indirect = FALSE;
   boolean shadow = FALSE;
-  unsigned readmask = 0;
+  unsigned readmask;
 
-  tex_info-target = inst-Texture.Texture;
-  switch (inst-Texture.Texture) {
-  case TGSI_TEXTURE_SHADOW1D:
- shadow = TRUE;
- /* Fallthrough */
-  case TGSI_TEXTURE_1D:
- readmask = TGSI_WRITEMASK_X;
- break;
-  case TGSI_TEXTURE_SHADOW1D_ARRAY:
-  case TGSI_TEXTURE_SHADOW2D:
-  case TGSI_TEXTURE_SHADOWRECT:
- shadow = TRUE;
- /* Fallthrough */
-  case TGSI_TEXTURE_1D_ARRAY:
-  case TGSI_TEXTURE_2D:
-  case TGSI_TEXTURE_RECT:
- readmask = TGSI_WRITEMASK_XY;
- break;
-  case TGSI_TEXTURE_SHADOW2D_ARRAY:
-  case TGSI_TEXTURE_SHADOWCUBE:
- shadow = TRUE;
- /* Fallthrough */
-  case TGSI_TEXTURE_2D_ARRAY:
-  case TGSI_TEXTURE_3D:
-  case TGSI_TEXTURE_CUBE:
- readmask = TGSI_WRITEMASK_XYZ;
- break;
-  case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
- shadow = TRUE;
- /* Fallthrough */
-  case TGSI_TEXTURE_CUBE_ARRAY:
- readmask = TGSI_WRITEMASK_XYZW;
- break;
-  default:
- assert(0);
- return;
-  }
+  /*
+   * We don't really get much information here, in particular not
+   * the target info, hence no useful writemask neither. Maybe should just
+   * forget the whole function.
+   */
+  readmask = TGSI_WRITEMASK_XYZW;
 
   tex_info-texture_unit = inst-Src[1].Register.Index;
   tex_info-sampler_unit = inst-Src[2].Register.Index;
@@ -327,20 +297,22 @@ analyse_instruction(struct analysis_context *ctx,
  analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_PROJECTED);
  break;
   case TGSI_OPCODE_SAMPLE:
+ analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_NONE, FALSE);
+ break;
   case TGSI_OPCODE_SAMPLE_C:
- analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_NONE);
+ analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_NONE, TRUE);
  break;
   case TGSI_OPCODE_SAMPLE_C_LZ:
- analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_ZERO);
+ analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_ZERO, TRUE);
  break;
   case TGSI_OPCODE_SAMPLE_D:
- analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV);
+ analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, FALSE);
  break;
   case TGSI_OPCODE_SAMPLE_B:
- analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS);
+ analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, FALSE);
  break;
   case TGSI_OPCODE_SAMPLE_L:
- analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD);
+ analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, FALSE);
 

[Mesa-dev] [PATCH] gallium: fix tgsi SAMPLE_L opcode to use separate source for explicit lod

2013-02-11 Thread sroland
From: Roland Scheidegger srol...@vmware.com

It looks like using coord.w as explicit lod value is a mistake, most likely
because some dx10 docs had it specified that way. Seems this was changed though:
http://msdn.microsoft.com/en-us/library/windows/desktop/hh447229%28v=vs.85%29.aspx
- let's just hope it doesn't depend on runtime build version or something.
Not only would this need translation (and so go against the stated goal that
these opcodes should be close to dx10 semantics) but it would prevent usage of
this opcode with cube arrays, which is apparently possible:
http://msdn.microsoft.com/en-us/library/windows/desktop/bb509699%28v=vs.85%29.aspx
(Note not only does this show cube arrays using explicit lod, but also the
confusion with this opcode: it lists an explicit lod parameter value, but then
states last component of location is used as lod).
(For true hw drivers, only nv50 had code to handle it, and it appears the
code was already right for the new semantics, though fix up the seemingly
wrong c/d arguments while there.)
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c   |5 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c|2 +-
 src/gallium/auxiliary/tgsi/tgsi_exec.c |2 +-
 src/gallium/auxiliary/tgsi/tgsi_info.c |2 +-
 src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h   |2 +-
 src/gallium/docs/source/tgsi.rst   |   12 ++--
 .../drivers/nv50/codegen/nv50_ir_from_tgsi.cpp |2 +-
 .../state_trackers/d3d1x/gd3d1x/sm4_to_tgsi.cpp|9 ++---
 8 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index cb6564a..3a19fe2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -241,13 +241,10 @@ analyse_sample(struct analysis_context *ctx,
   tex_info-sampler_unit = inst-Src[2].Register.Index;
 
   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV ||
+  modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD ||
   modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS || shadow) {
  /* We don't track insts with additional regs, although we could */
  indirect = TRUE;
-  }  else {
- if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-readmask |= TGSI_WRITEMASK_W;
- }
   }
 
   for (chan = 0; chan < 4; ++chan) {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 52a60dd..f816103 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1430,7 +1430,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
   /* lod bias comes from src 3.r but explicit lod from 0.a */
   lod_bias = NULL;
-  explicit_lod = lp_build_emit_fetch( bld-bld_base, inst, 0, 3 );
+  explicit_lod = lp_build_emit_fetch( bld-bld_base, inst, 3, 0 );
}
else if (modifier == LP_BLD_TEX_MODIFIER_LOD_ZERO) {
   lod_bias = NULL;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 6da7d42..03f1942 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2154,7 +2154,7 @@ exec_sample(struct tgsi_exec_machine *mach,
  control = tgsi_sampler_lod_bias;
   }
   else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
- FETCH(c1, 0, TGSI_CHAN_W);
+ FETCH(c1, 3, TGSI_CHAN_X);
  lod = c1;
  control = tgsi_sampler_lod_explicit;
   }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c 
b/src/gallium/auxiliary/tgsi/tgsi_info.c
index f8a3cb6..f289ebc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -190,7 +190,7 @@ static const struct tgsi_opcode_info 
opcode_info[TGSI_OPCODE_LAST] =
{ 1, 4, 0, 0, 0, 0, OTHR, SAMPLE_C,TGSI_OPCODE_SAMPLE_C },
{ 1, 4, 0, 0, 0, 0, OTHR, SAMPLE_C_LZ, TGSI_OPCODE_SAMPLE_C_LZ },
{ 1, 5, 0, 0, 0, 0, OTHR, SAMPLE_D,TGSI_OPCODE_SAMPLE_D },
-   { 1, 3, 0, 0, 0, 0, OTHR, SAMPLE_L,TGSI_OPCODE_SAMPLE_L },
+   { 1, 4, 0, 0, 0, 0, OTHR, SAMPLE_L,TGSI_OPCODE_SAMPLE_L },
{ 1, 3, 0, 0, 0, 0, OTHR, GATHER4, TGSI_OPCODE_GATHER4 },
{ 1, 2, 0, 0, 0, 0, OTHR, SVIEWINFO,   TGSI_OPCODE_SVIEWINFO },
{ 1, 2, 0, 0, 0, 0, OTHR, SAMPLE_POS,  TGSI_OPCODE_SAMPLE_POS },
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h 
b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index 75e27a6..4a1b811 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -176,7 +176,7 @@ OP14(SAMPLE_B)
 OP14(SAMPLE_C)
 OP14(SAMPLE_C_LZ)
 OP15(SAMPLE_D)
-OP13(SAMPLE_L)
+OP14(SAMPLE_L)
 OP13(GATHER4)
 OP12(SVIEWINFO)
 OP13(SAMPLE_POS)
diff --git 

[Mesa-dev] [PATCH 1/2] llvmpipe: first steps of adding dual source blend support

2013-02-07 Thread sroland
From: Roland Scheidegger srol...@vmware.com

This adds support of the additional blending factors to the blend function
itself, and also enables testing of it in lp_test_blend (which passes).
Still need to add the glue code of linking fs shader outputs to blend inputs
in llvmpipe, and probably need to add special handling if destination doesn't
include alpha (which lp_test_blend doesn't test).
---
 src/gallium/drivers/llvmpipe/lp_bld_blend.h |1 +
 src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |   12 ++--
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |1 +
 src/gallium/drivers/llvmpipe/lp_test_blend.c|   70 ---
 4 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h 
b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index 394935b..4bd2867 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -61,6 +61,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
unsigned rt,
LLVMValueRef src,
LLVMValueRef src_alpha,
+   LLVMValueRef src1,
LLVMValueRef dst,
LLVMValueRef mask,
LLVMValueRef const_,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c 
b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index ccdf852..8e9e7fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -69,6 +69,7 @@ struct lp_build_blend_aos_context
 
LLVMValueRef src;
LLVMValueRef src_alpha;
+   LLVMValueRef src1;
LLVMValueRef dst;
LLVMValueRef const_;
LLVMValueRef const_alpha;
@@ -123,9 +124,7 @@ lp_build_blend_factor_unswizzled(struct 
lp_build_blend_aos_context *bld,
   return const_alpha;
case PIPE_BLENDFACTOR_SRC1_COLOR:
case PIPE_BLENDFACTOR_SRC1_ALPHA:
-  /* TODO */
-  assert(0);
-  return bld-base.zero;
+  return bld-src1;
case PIPE_BLENDFACTOR_INV_SRC_COLOR:
   if(!bld-inv_src)
  bld-inv_src = lp_build_comp(bld-base, bld-src);
@@ -149,9 +148,7 @@ lp_build_blend_factor_unswizzled(struct 
lp_build_blend_aos_context *bld,
   return bld-inv_const_alpha;
case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-  /* TODO */
-  assert(0);
-  return bld-base.zero;
+  return lp_build_comp(bld-base, bld-src1);
default:
   assert(0);
   return bld-base.zero;
@@ -268,6 +265,7 @@ lp_build_blend_factor(struct lp_build_blend_aos_context 
*bld,
  * @param type  data type of the pixel vector
  * @param rtrender target index
  * @param src   blend src
+ * @param src1  second blend src (for dual source blend)
  * @param dst   blend dst
  * @param mask  optional mask to apply to the blending result
  * @param const_const blend color
@@ -283,6 +281,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
unsigned rt,
LLVMValueRef src,
LLVMValueRef src_alpha,
+   LLVMValueRef src1,
LLVMValueRef dst,
LLVMValueRef mask,
LLVMValueRef const_,
@@ -304,6 +303,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
memset(&bld, 0, sizeof bld);
lp_build_context_init(&bld.base, gallivm, type);
bld.src = src;
+   bld.src1 = src1;
bld.dst = dst;
bld.const_ = const_;
bld.src_alpha = src_alpha;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 90a67e6..2b31c14 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -1693,6 +1693,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
   rt,
   src[i],
   has_alpha ? NULL : src_alpha[i],
+  NULL,
   dst[i],
   partial_mask ? src_mask[i] : NULL,
   blend_color,
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c 
b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 754434d..6faaedf 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -45,7 +45,8 @@
 #include "lp_test.h"
 
 
-typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void 
*con, void *res);
+typedef void (*blend_test_ptr_t)(const void *src, const void *src1,
+ const void *dst, const void *con, void *res);
 
 
 void
@@ -138,9 +139,10 @@ add_blend_test(struct gallivm_state *gallivm,
LLVMModuleRef module = gallivm-module;
LLVMContextRef context = gallivm-context;
LLVMTypeRef vec_type;
-   LLVMTypeRef args[4];
+   

[Mesa-dev] [PATCH 2/2] llvmpipe: implement dual source blending

2013-02-07 Thread sroland
From: Roland Scheidegger srol...@vmware.com

link up the fs outputs and blend inputs, and make sure the second blend source
is correctly loaded and converted (which is quite complex).
There's a slight refactoring of the monster generate_unswizzled_blend()
function where it makes sense to factor out alpha conversion (which needs
to run twice for dual source blend).
---
 src/gallium/drivers/llvmpipe/lp_bld_blend.h |1 +
 src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |   13 +-
 src/gallium/drivers/llvmpipe/lp_screen.c|2 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  331 +--
 src/gallium/drivers/llvmpipe/lp_test_blend.c|3 +-
 5 files changed, 257 insertions(+), 93 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h 
b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index 4bd2867..249a345 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -62,6 +62,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
LLVMValueRef src,
LLVMValueRef src_alpha,
LLVMValueRef src1,
+   LLVMValueRef src1_alpha,
LLVMValueRef dst,
LLVMValueRef mask,
LLVMValueRef const_,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c 
b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index 8e9e7fe..c4d04a2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -70,6 +70,7 @@ struct lp_build_blend_aos_context
LLVMValueRef src;
LLVMValueRef src_alpha;
LLVMValueRef src1;
+   LLVMValueRef src1_alpha;
LLVMValueRef dst;
LLVMValueRef const_;
LLVMValueRef const_alpha;
@@ -94,6 +95,7 @@ lp_build_blend_factor_unswizzled(struct 
lp_build_blend_aos_context *bld,
  boolean alpha)
 {
LLVMValueRef src_alpha = bld->src_alpha ? bld->src_alpha : bld->src;
+   LLVMValueRef src1_alpha = bld->src1_alpha ? bld->src1_alpha : bld->src1;
LLVMValueRef const_alpha = bld->const_alpha ? bld->const_alpha : 
bld->const_;
 
switch (factor) {
@@ -123,8 +125,9 @@ lp_build_blend_factor_unswizzled(struct 
lp_build_blend_aos_context *bld,
case PIPE_BLENDFACTOR_CONST_ALPHA:
   return const_alpha;
case PIPE_BLENDFACTOR_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:
   return bld-src1;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+  return src1_alpha;
case PIPE_BLENDFACTOR_INV_SRC_COLOR:
   if(!bld-inv_src)
  bld-inv_src = lp_build_comp(bld-base, bld-src);
@@ -147,8 +150,9 @@ lp_build_blend_factor_unswizzled(struct 
lp_build_blend_aos_context *bld,
  bld-inv_const_alpha = lp_build_comp(bld-base, const_alpha);
   return bld-inv_const_alpha;
case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
   return lp_build_comp(bld-base, bld-src1);
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+  return lp_build_comp(bld-base, src1_alpha);
default:
   assert(0);
   return bld-base.zero;
@@ -265,10 +269,13 @@ lp_build_blend_factor(struct lp_build_blend_aos_context 
*bld,
  * @param type  data type of the pixel vector
  * @param rtrender target index
  * @param src   blend src
+ * @param src_alpha blend src alpha (if not included in src)
  * @param src1  second blend src (for dual source blend)
+ * @param src1_alphasecond blend src alpha (if not included in src1)
  * @param dst   blend dst
  * @param mask  optional mask to apply to the blending result
  * @param const_const blend color
+ * @param const_alpha   const blend color alpha (if not included in const_)
  * @param swizzle   swizzle values for RGBA
  *
  * @return the result of blending src and dst
@@ -282,6 +289,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
LLVMValueRef src,
LLVMValueRef src_alpha,
LLVMValueRef src1,
+   LLVMValueRef src1_alpha,
LLVMValueRef dst,
LLVMValueRef mask,
LLVMValueRef const_,
@@ -307,6 +315,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
bld.dst = dst;
bld.const_ = const_;
bld.src_alpha = src_alpha;
+   bld.src1_alpha = src1_alpha;
bld.const_alpha = const_alpha;
 
/* Find the alpha channel if not provided seperately */
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index f3bc516..b9c1567 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -115,7 +115,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_SM3:
   return 1;
case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
-  return 0;
+  return 1;
case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
   

<    1   2   3   4   5   6   7   >