Re: [Mesa-dev] [PATCH] gallivm: do per-pixel lod calculations for explicit lod

2013-07-03 Thread Roland Scheidegger
On 03.07.2013 17:28, Jose Fonseca wrote:
> I don't fully grasp every detail (many paths), but looks good in principle.
> 
> Where do the 16xf32 vectors come from?
Those are the size vectors. Normally (except for the 1d case) these contain
width/height/depth/_ (or just width/height for a 2d texture).
So, if there's one lod, width/height/depth all get minified in one step and
the sizes are then extracted from that.
In the 8-wide case with 2 lods the minification is done separately for both
levels (because sse lacks a true vector shift) and the vectors are then
concatenated.
But now we've got (for the 8-wide case) 8 lods, hence 8 of these w/h/d/_
vectors all concatenated into one.
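Roughly, as a plain scalar model (made-up helper names, not the actual
gallivm code, which builds LLVM IR for this), the layout and per-lod
minification look like this:

```c
/* Scalar model of the packed size vectors: num_lods groups of w/h/d/_,
 * each group minified by its own lod.  With one lod this is just a single
 * w/h/d/_ group minified in one step, as described above. */
static void
minify_size_vector(const int base_size[4],   /* w, h, d, _ of the base level */
                   const int *lods,          /* one lod per group */
                   int num_lods,
                   int *out)                 /* 4 * num_lods entries */
{
   int i, c;
   for (i = 0; i < num_lods; i++) {
      for (c = 0; c < 3; c++) {
         int s = base_size[c] >> lods[i];    /* like u_minify(): clamp to 1 */
         out[i * 4 + c] = s < 1 ? 1 : s;
      }
      out[i * 4 + 3] = base_size[3];         /* unused padding lane */
   }
}
```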
Now ideally I guess we'd just use separate vectors, but that would mean
even more specialized code. Also, consider that even if you had separate
"width" and "height" vectors, you still couldn't minify them by the lod
values cheaply: it would be a buttload of scalar extractions, scalar
shifts and inserts (REALLY missing the true vector shift), unless you've
got AVX2 (or an AMD Bulldozer). So it's probably still better to keep the
w/h/d/_ vectors and just hope llvm does something reasonable with them
(we hopefully shouldn't hit intrinsics as the vectors aren't really used
a lot).
In fact the minification code there looks so butt ugly because I needed
to keep separate paths for 4-wide and 8-wide. If you have a
w/h/d/_/w/h/d/_ vector and try to minify (right shift) it by
l0 l0 l0 l0 l1 l1 l1 l1, llvm cannot split that into two 4-wide parts
(last time I checked), and that simple shift turns into a mess of 16
extracts, 8 scalar shifts and 8 inserts... All because intel left out
the real vector shift instruction before avx2; I don't think there's any
other simd instruction set which lacks it.
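Just to illustrate the difference (a standalone sketch, nothing from the
actual patch, which emits LLVM IR rather than calling intrinsics directly):

```c
#include <stdint.h>
#ifdef __AVX2__
#include <immintrin.h>
#endif

/* Minify 8 packed sizes by 8 per-lane lod values. Illustrative only. */
static void
minify8(const uint32_t size[8], const uint32_t lod[8], uint32_t out[8])
{
#ifdef __AVX2__
   /* AVX2 vpsrlvd: one true variable-count vector shift. */
   __m256i s = _mm256_loadu_si256((const __m256i *)size);
   __m256i l = _mm256_loadu_si256((const __m256i *)lod);
   _mm256_storeu_si256((__m256i *)out, _mm256_srlv_epi32(s, l));
#else
   /* Pre-AVX2 there is no variable-count integer vector shift, so a
    * vectorized version degenerates to extract + scalar shift + insert
    * per lane (written here as the equivalent scalar loop). */
   int i;
   for (i = 0; i < 8; i++)
      out[i] = size[i] >> lod[i];
#endif
}
```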

> 
> Also, please add a comment somewhere summarizing all the code paths for lod 
> handling:
> 
>  - AVX vs non AVX
>  - SOA vs AOS
>  - scalar lod vs stamp lod
Ok I'll add some more clarifying comments.
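Something roughly along these lines, maybe (just a sketch of the paths
discussed here, not the final wording):

```c
/*
 * Lod handling paths (sketch):
 * - per-quad ("stamp") lod vs per-pixel lod: one lod per quad is the old
 *   behaviour and is still what fragment shaders get for now; per-pixel
 *   lod is used for explicit lod coming from vs/gs (and eventually for
 *   lod bias / explicit derivatives too, as d3d10 wants).
 * - 4-wide (sse) vs 8-wide (avx) vectors: with 8-wide vectors and more
 *   than one lod the size minification is done in separate 4-wide chunks,
 *   since there's no variable vector shift before avx2.
 * - soa vs aos sampling paths (lp_bld_sample_soa.c / lp_bld_sample_aos.c).
 */
```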

Roland


> 
> But I couldn't spot anything wrong.
> 


Re: [Mesa-dev] [PATCH] gallivm: do per-pixel lod calculations for explicit lod

2013-07-03 Thread Jose Fonseca
I don't fully grasp every detail (many paths), but looks good in principle.

Where do the 16xf32 vectors come from?

Also, please add a comment somewhere summarizing all the code paths for lod 
handling:

 - AVX vs non AVX
 - SOA vs AOS
 - scalar lod vs stamp lod

But I couldn't spot anything wrong.

Jose


----- Original Message -----
> From: Roland Scheidegger 
> 
> d3d10 requires per-pixel lod calculations for explicit lod, lod bias and
> explicit derivatives, and we should probably do it for OpenGL too - at least
> when they are used from vertex or geometry shaders (so this doesn't apply to
> lod bias), where the lod doesn't just affect neighboring pixels.
> Some code was already there to handle this, so fix it up and enable it.
> Unfortunately there will no doubt be a performance hit; we could do better
> if we knew we had a real vector shift instruction (with a variable shift
> count), but that requires AVX2 on x86 (or an AMD Bulldozer family cpu).
> Don't do anything for lod bias and explicit derivatives yet, though
> no special magic should be needed for them either.
> Likewise, the size query is still broken just the same.
> 
> v2: Use information about whether the lod is a (broadcast) scalar or not. The
> idea would be to base this on the actual value; for now just pretend it's a
> scalar in fs and not a scalar otherwise (so per-pixel lod is only used in
> gs/vs, but the same code is generated for fs as before).
> ---
> [patch diff snipped; the full patch follows below]

[Mesa-dev] [PATCH] gallivm: do per-pixel lod calculations for explicit lod

2013-07-03 Thread sroland
From: Roland Scheidegger 

d3d10 requires per-pixel lod calculations for explicit lod, lod bias and
explicit derivatives, and we should probably do it for OpenGL too - at least
when they are used from vertex or geometry shaders (so this doesn't apply to
lod bias), where the lod doesn't just affect neighboring pixels.
Some code was already there to handle this, so fix it up and enable it.
Unfortunately there will no doubt be a performance hit; we could do better
if we knew we had a real vector shift instruction (with a variable shift
count), but that requires AVX2 on x86 (or an AMD Bulldozer family cpu).
Don't do anything for lod bias and explicit derivatives yet, though
no special magic should be needed for them either.
Likewise, the size query is still broken just the same.

v2: Use information about whether the lod is a (broadcast) scalar or not. The
idea would be to base this on the actual value; for now just pretend it's a
scalar in fs and not a scalar otherwise (so per-pixel lod is only used in
gs/vs, but the same code is generated for fs as before).
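
(For illustration, the v2 heuristic above boils down to something like the
following; hypothetical helper, not code from this patch:)

```c
#include "pipe/p_compiler.h"   /* boolean */
#include "pipe/p_defines.h"    /* PIPE_SHADER_* */

/* Sketch of the v2 heuristic: treat the explicit lod as a single broadcast
 * scalar only in fragment shaders (keeping the old per-quad code there),
 * and as a true per-element value in vertex/geometry shaders. */
static boolean
lod_is_broadcast_scalar(unsigned shader_stage)
{
   return shader_stage == PIPE_SHADER_FRAGMENT;
}
```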
---
 src/gallium/auxiliary/draw/draw_llvm_sample.c |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  110 -
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |   13 ++-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c |   26 ++---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  130 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi.h   |1 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c   |   18 ++-
 src/gallium/auxiliary/tgsi/tgsi_scan.c|1 +
 src/gallium/auxiliary/tgsi/tgsi_scan.h|2 +
 src/gallium/drivers/llvmpipe/lp_tex_sample.c  |3 +-
 10 files changed, 181 insertions(+), 126 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
index e51e011..0cb5c21 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -238,6 +238,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
+   boolean scalar_lod,
LLVMValueRef *texel)
 {
struct draw_llvm_sampler_soa *sampler = (struct draw_llvm_sampler_soa *)base;
@@ -256,7 +257,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
coords,
offsets,
derivs,
-   lod_bias, explicit_lod,
+   lod_bias, explicit_lod, scalar_lod,
texel);
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d689c7b..c2efec9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -215,7 +215,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
struct lp_build_context *float_bld = &bld->float_bld;
struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
const unsigned dims = bld->dims;
LLVMValueRef ddx_ddy[2];
LLVMBuilderRef builder = bld->gallivm->builder;
@@ -235,6 +235,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
 
   /* Note that all simplified calculations will only work for isotropic filtering */
 
+   assert(bld->num_lods != length);
+
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
  bld->gallivm, texture_unit);
first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
@@ -248,14 +250,14 @@ lp_build_rho(struct lp_build_sample_context *bld,
    * Cube map code did already everything except size mul and per-quad extraction.
*/
   rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-  perquadf_bld->type, cube_rho, 0);
+  levelf_bld->type, cube_rho, 0);
   if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
- rho = lp_build_sqrt(perquadf_bld, rho);
+ rho = lp_build_sqrt(levelf_bld, rho);
   }
   /* Could optimize this for single quad just skip the broadcast */
   cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-perquadf_bld->type, float_size, index0);
-  rho = lp_build_mul(perquadf_bld, cubesize, rho);
+levelf_bld->type, float_size, index0);
+  rho = lp_build_mul(levelf_bld, cubesize, rho);

[Mesa-dev] [PATCH] gallivm: do per-pixel lod calculations for explicit lod

2013-06-27 Thread sroland
From: Roland Scheidegger 

d3d10 requires per-pixel lod calculations for explicit lod, lod bias and
explicit derivatives, and we should probably do it for OpenGL too - at least
when they are used from vertex or geometry shaders (so this doesn't apply to
lod bias), where the lod doesn't just affect neighboring pixels.
Some code was already there to handle this, so fix it up and enable it.
Unfortunately there will no doubt be a performance hit; we could do better
if we knew we had a real vector shift instruction (with a variable shift
count), but that requires AVX2 on x86 (or an AMD Bulldozer family cpu).
Don't do anything for lod bias and explicit derivatives yet, though
no special magic should be needed for them either.
Likewise, the size query is still broken just the same.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  110 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |   12 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c |   26 ++---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  128 +
 4 files changed, 155 insertions(+), 121 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d689c7b..c2efec9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -215,7 +215,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
struct lp_build_context *float_bld = &bld->float_bld;
struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+   struct lp_build_context *levelf_bld = &bld->levelf_bld;
const unsigned dims = bld->dims;
LLVMValueRef ddx_ddy[2];
LLVMBuilderRef builder = bld->gallivm->builder;
@@ -235,6 +235,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
 
   /* Note that all simplified calculations will only work for isotropic filtering */
 
+   assert(bld->num_lods != length);
+
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
  bld->gallivm, texture_unit);
first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
@@ -248,14 +250,14 @@ lp_build_rho(struct lp_build_sample_context *bld,
    * Cube map code did already everything except size mul and per-quad extraction.
*/
   rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-  perquadf_bld->type, cube_rho, 0);
+  levelf_bld->type, cube_rho, 0);
   if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
- rho = lp_build_sqrt(perquadf_bld, rho);
+ rho = lp_build_sqrt(levelf_bld, rho);
   }
   /* Could optimize this for single quad just skip the broadcast */
   cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-perquadf_bld->type, float_size, index0);
-  rho = lp_build_mul(perquadf_bld, cubesize, rho);
+levelf_bld->type, float_size, index0);
+  rho = lp_build_mul(levelf_bld, cubesize, rho);
}
   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
   LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -289,12 +291,12 @@ lp_build_rho(struct lp_build_sample_context *bld,
  }
  rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
  rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- perquadf_bld->type, rho_vec, 0);
+ levelf_bld->type, rho_vec, 0);
  /*
  * note that as long as we don't care about per-pixel lod could reduce math
  * more (at some shuffle cost), but for now only do sqrt after packing.
   */
- rho = lp_build_sqrt(perquadf_bld, rho);
+ rho = lp_build_sqrt(levelf_bld, rho);
   }
   else {
  rho_vec = ddmax[0];
@@ -309,7 +311,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
   * since we can't handle per-pixel rho/lod from now on (TODO).
   */
  rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- perquadf_bld->type, rho_vec, 0);
+ levelf_bld->type, rho_vec, 0);
   }
}
else {
@@ -381,8 +383,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
  rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
 
  rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- perquadf_bld->type, rho_vec, 0);
- rho = lp_build_sqrt(perquadf_bld, rho);
+ levelf_bld->type, rho_vec, 0);
+ rho = lp_build_sqrt(levelf_bld, rho);
   }
   else {