[Mesa-dev] [PATCH] util/atomic: Fix p_atomic_add for unlocked and msvc paths

2019-12-09 Thread sroland
From: Roland Scheidegger 

Braces mismatch (flagged by CI, untested).

Fixes: 385d13f26d2 "util/atomic: Add a _return variant of p_atomic_add"
---
 src/util/u_atomic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/util/u_atomic.h b/src/util/u_atomic.h
index 9cbc6dd1eaa..1ad87c8feb1 100644
--- a/src/util/u_atomic.h
+++ b/src/util/u_atomic.h
@@ -89,7 +89,7 @@
 #define p_atomic_dec_zero(_v) (p_atomic_dec_return(_v) == 0)
 #define p_atomic_inc(_v) ((void) p_atomic_inc_return(_v))
 #define p_atomic_dec(_v) ((void) p_atomic_dec_return(_v))
-#define p_atomic_add(_v, _i) ((void) p_atomic_add_return((_v), (_i))
+#define p_atomic_add(_v, _i) ((void) p_atomic_add_return((_v), (_i)))
 #define p_atomic_inc_return(_v) (++(*(_v)))
 #define p_atomic_dec_return(_v) (--(*(_v)))
 #define p_atomic_add_return(_v, _i) (*(_v) = *(_v) + (_i))
@@ -146,7 +146,7 @@
  (assert(!"should not get here"), 0))
 
 #define p_atomic_add(_v, _i) \
-   ((void) p_atomic_add_return((_v), (_i))
+   ((void) p_atomic_add_return((_v), (_i)))
 
 #define p_atomic_add_return(_v, _i) (\
sizeof *(_v) == sizeof(char)? _InterlockedExchangeAdd8 ((char *)   
(_v), (_i)) : \
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: Fix saturated signed psub/padd intrinsics on llvm 8

2019-10-16 Thread sroland
From: Roland Scheidegger 

LLVM 8 did remove both the signed and unsigned sse2/avx intrinsics in
the end, and provide arch-independent llvm intrinsics instead.
Fixes a crash when using snorm framebuffers (tested with piglit
arb_color_buffer_float-render GL_RGBA8_SNORM -auto).

CC: 
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 28 -
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 6b7ce9aacf9..53ee00e6767 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -465,7 +465,7 @@ lp_build_add(struct lp_build_context *bld,
 return bld->one;
 
   if (!type.floating && !type.fixed) {
- if (LLVM_VERSION_MAJOR >= 9) {
+ if (LLVM_VERSION_MAJOR >= 8) {
 char intrin[32];
 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, 
bld->vec_type);
@@ -474,11 +474,9 @@ lp_build_add(struct lp_build_context *bld,
  if (type.width * type.length == 128) {
 if (util_cpu_caps.has_sse2) {
if (type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
- LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.sse2.paddus.b" : NULL;
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : 
"llvm.x86.sse2.paddus.b";
if (type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
- LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.sse2.paddus.w" : NULL;
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : 
"llvm.x86.sse2.paddus.w";
 } else if (util_cpu_caps.has_altivec) {
if (type.width == 8)
   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : 
"llvm.ppc.altivec.vaddubs";
@@ -489,11 +487,9 @@ lp_build_add(struct lp_build_context *bld,
  if (type.width * type.length == 256) {
 if (util_cpu_caps.has_avx2) {
if (type.width == 8)
-  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
-  LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.avx2.paddus.b" : NULL;
+  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : 
"llvm.x86.avx2.paddus.b";
if (type.width == 16)
-  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
-  LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.avx2.paddus.w" : NULL;
+  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : 
"llvm.x86.avx2.paddus.w";
 }
  }
   }
@@ -793,7 +789,7 @@ lp_build_sub(struct lp_build_context *bld,
 return bld->zero;
 
   if (!type.floating && !type.fixed) {
- if (LLVM_VERSION_MAJOR >= 9) {
+ if (LLVM_VERSION_MAJOR >= 8) {
 char intrin[32];
 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, 
bld->vec_type);
@@ -802,11 +798,9 @@ lp_build_sub(struct lp_build_context *bld,
  if (type.width * type.length == 128) {
 if (util_cpu_caps.has_sse2) {
if (type.width == 8)
-  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
-  LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.sse2.psubus.b" : NULL;
+  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : 
"llvm.x86.sse2.psubus.b";
if (type.width == 16)
-  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
-  LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.sse2.psubus.w" : NULL;
+  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : 
"llvm.x86.sse2.psubus.w";
 } else if (util_cpu_caps.has_altivec) {
if (type.width == 8)
   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : 
"llvm.ppc.altivec.vsububs";
@@ -817,11 +811,9 @@ lp_build_sub(struct lp_build_context *bld,
  if (type.width * type.length == 256) {
 if (util_cpu_caps.has_avx2) {
if (type.width == 8)
-  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
-  LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.avx2.psubus.b" : NULL;
+  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : 
"llvm.x86.avx2.psubus.b";
if (type.width == 16)
-  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
-  LLVM_VERSION_MAJOR < 8 ? 
"llvm.x86.avx2.psubus.w" : NULL;
+  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : 
"llvm.x86.avx2.psubus.w";
 }
  }
   }
-- 
2.17.1

___

[Mesa-dev] [PATCH] llvmpipe: increase max texture size to 2GB

2019-10-10 Thread sroland
From: Roland Scheidegger 

The 1GB limit was arbitrary, increase this to 2GB (which is the max
possible without code changes).
---
 src/gallium/drivers/llvmpipe/lp_limits.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h 
b/src/gallium/drivers/llvmpipe/lp_limits.h
index c2808162c78..569179ecdf4 100644
--- a/src/gallium/drivers/llvmpipe/lp_limits.h
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -43,7 +43,11 @@
 /**
  * Max texture sizes
  */
-#define LP_MAX_TEXTURE_SIZE (1 * 1024 * 1024 * 1024ULL)  /* 1GB for now */
+/**
+ * 2GB is the actual max currently (we always use 32bit offsets, and both
+ * llvm GEP as well as avx2 gather use signed offsets).
+ */
+#define LP_MAX_TEXTURE_SIZE (2 * 1024 * 1024 * 1024ULL)
 #define LP_MAX_TEXTURE_2D_LEVELS 14  /* 8K x 8K for now */
 #define LP_MAX_TEXTURE_3D_LEVELS 12  /* 2K x 2K x 2K for now */
 #define LP_MAX_TEXTURE_CUBE_LEVELS 14  /* 8K x 8K for now */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] llvmpipe: fix CALLOC vs. free mismatches

2019-09-05 Thread sroland
From: Roland Scheidegger 

Should fix some issues we're seeing. And use REALLOC instead of realloc.
---
 src/gallium/drivers/llvmpipe/lp_cs_tpool.c | 6 +++---
 src/gallium/drivers/llvmpipe/lp_state_cs.c | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_cs_tpool.c 
b/src/gallium/drivers/llvmpipe/lp_cs_tpool.c
index 04495727e1c..6f1b4e2ee55 100644
--- a/src/gallium/drivers/llvmpipe/lp_cs_tpool.c
+++ b/src/gallium/drivers/llvmpipe/lp_cs_tpool.c
@@ -65,7 +65,7 @@ lp_cs_tpool_worker(void *data)
  cnd_broadcast(&task->finish);
}
mtx_unlock(&pool->m);
-   free(lmem.local_mem_ptr);
+   FREE(lmem.local_mem_ptr);
return 0;
 }
 
@@ -105,7 +105,7 @@ lp_cs_tpool_destroy(struct lp_cs_tpool *pool)
 
cnd_destroy(&pool->new_work);
mtx_destroy(&pool->m);
-   free(pool);
+   FREE(pool);
 }
 
 struct lp_cs_tpool_task *
@@ -148,6 +148,6 @@ lp_cs_tpool_wait_for_task(struct lp_cs_tpool *pool,
mtx_unlock(&pool->m);
 
cnd_destroy(&task->finish);
-   free(task);
+   FREE(task);
*task_handle = NULL;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_cs.c 
b/src/gallium/drivers/llvmpipe/lp_state_cs.c
index 1645a185cb2..a26cbf4df22 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_cs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_cs.c
@@ -1123,8 +1123,9 @@ cs_exec_fn(void *init_data, int iter_idx, struct 
lp_cs_local_mem *lmem)
memset(&thread_data, 0, sizeof(thread_data));
 
if (lmem->local_size < job_info->req_local_mem) {
+  lmem->local_mem_ptr = REALLOC(lmem->local_mem_ptr, lmem->local_size,
+job_info->req_local_mem);
   lmem->local_size = job_info->req_local_mem;
-  lmem->local_mem_ptr = realloc(lmem->local_mem_ptr, lmem->local_size);
}
thread_data.shared = lmem->local_mem_ptr;
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: use fallback code for mul_hi with llvm >= 7.0

2019-08-28 Thread sroland
From: Roland Scheidegger 

LLVM 7.0 ditched the pmulu intrinsics.
This is only a trivial patch to use the fallback code instead.
It'll likely produce atrocious code since the pattern doesn't match what
llvm itself uses in its autoupgrade paths, hence the pattern won't be
recognized.

Should fix https://bugs.freedesktop.org/show_bug.cgi?id=111496
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index c4931c0b230..f1866c6625f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1169,8 +1169,13 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
 * https://llvm.org/bugs/show_bug.cgi?id=30845
 * So, whip up our own code, albeit only for length 4 and 8 (which
 * should be good enough)...
+* FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
+* (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
+* for signed), which the fallback code does not, without this llvm
+* will likely still produce atrocious code.
 */
-   if ((bld->type.length == 4 || bld->type.length == 8) &&
+   if (HAVE_LLVM < 0x0700 &&
+   (bld->type.length == 4 || bld->type.length == 8) &&
((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
 util_cpu_caps.has_sse4_1)) {
   const char *intrinsic = NULL;
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: fix issue with AtomicCmpXchg wrapper on llvm 3.5-3.8

2019-08-02 Thread sroland
From: Roland Scheidegger 

These versions still need the wrapper but already have both success and
failure ordering.
(Compile tested on llvm 3.7, llvm 3.8.)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=02
---
 src/gallium/auxiliary/gallivm/lp_bld_misc.cpp | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp 
b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 79d10293e80..723c84d57c2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -822,15 +822,29 @@ static llvm::AtomicOrdering 
mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) {
llvm_unreachable("Invalid LLVMAtomicOrdering value!");
 }
 
+#if HAVE_LLVM < 0x305
 LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr,
 LLVMValueRef Cmp, LLVMValueRef New,
 LLVMAtomicOrdering SuccessOrdering,
 LLVMAtomicOrdering FailureOrdering,
 LLVMBool SingleThread)
 {
-   /* LLVM 3.8 doesn't have a second ordering and uses old 
SynchronizationScope enum */
+   /* LLVM < 3.5 doesn't have a second ordering and uses old 
SynchronizationScope enum */
return llvm::wrap(llvm::unwrap(B)->CreateAtomicCmpXchg(llvm::unwrap(Ptr), 
llvm::unwrap(Cmp),
   llvm::unwrap(New), 
mapFromLLVMOrdering(SuccessOrdering),
   SingleThread ? 
llvm::SynchronizationScope::SingleThread : 
llvm::SynchronizationScope::CrossThread));
 }
+#else
+LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr,
+LLVMValueRef Cmp, LLVMValueRef New,
+LLVMAtomicOrdering SuccessOrdering,
+LLVMAtomicOrdering FailureOrdering,
+LLVMBool SingleThread)
+{
+   return llvm::wrap(llvm::unwrap(B)->CreateAtomicCmpXchg(llvm::unwrap(Ptr), 
llvm::unwrap(Cmp),
+  llvm::unwrap(New), 
mapFromLLVMOrdering(SuccessOrdering),
+  
mapFromLLVMOrdering(FailureOrdering),
+  SingleThread ? 
llvm::SynchronizationScope::SingleThread : 
llvm::SynchronizationScope::CrossThread));
+}
+#endif
 #endif
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] scons: fix build with llvm 9.

2019-05-23 Thread sroland
From: Roland Scheidegger 

The x86asmprinter component is gone, and things seem to work by just
removing it.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110707
---
 scons/llvm.py | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scons/llvm.py b/scons/llvm.py
index a84ad51d97a..bf9666459c6 100644
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -260,7 +260,10 @@ def generate(env):
 if '-fno-rtti' in cxxflags:
 env.Append(CXXFLAGS = ['-fno-rtti'])
 
-components = ['engine', 'mcjit', 'bitwriter', 'x86asmprinter', 
'mcdisassembler', 'irreader']
+if llvm_version < distutils.version.LooseVersion('9.0'):
+   components = ['engine', 'mcjit', 'bitwriter', 'x86asmprinter', 
'mcdisassembler', 'irreader']
+else:
+   components = ['engine', 'mcjit', 'bitwriter', 'mcdisassembler', 
'irreader']
 
 env.ParseConfig('%s --libs ' % llvm_config + ' '.join(components))
 env.ParseConfig('%s --ldflags' % llvm_config)
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: fix default cbuf info.

2019-05-23 Thread sroland
From: Roland Scheidegger 

The default null_output really needs to be static, otherwise the values
we'll eventually get later are doubly random (they are not initialized,
and even if they were it's a pointer to a local stack variable).
VMware bug 2349556.
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index b4e3c2fbc8..9fc9b8c77e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -608,7 +608,7 @@ finished:
 */
 
for (index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) {
-  const struct lp_tgsi_channel_info null_output[4];
+  static const struct lp_tgsi_channel_info null_output[4];
   info->cbuf[index] = null_output;
}
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] auxiliary/draw: fix crash with zero-stride draw auto

2019-05-15 Thread sroland
From: Roland Scheidegger 

transform feedback draws get the number of vertices from the transform
feedback object. In draw, we'll figure this out with the number of bytes
written divided by the stride. However, it is apparently possible we end
up with a stride of 0 there (not entirely sure it could happen with GL).
Probably when nothing was actually ever written (so we don't actually
have a stride set). Just avoid the division by zero by setting the count
to 0.
---
 src/gallium/auxiliary/draw/draw_pt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt.c 
b/src/gallium/auxiliary/draw/draw_pt.c
index 50286149cd4..eeebca30ce7 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -440,7 +440,8 @@ resolve_draw_info(const struct pipe_draw_info *raw_info,
   struct draw_so_target *target =
  (struct draw_so_target *)info->count_from_stream_output;
   assert(vertex_buffer != NULL);
-  info->count = target->internal_offset / vertex_buffer->stride;
+  info->count = vertex_buffer->stride == 0 ? 0 :
+   target->internal_offset / vertex_buffer->stride;
 
   /* Stream output draw can not be indexed */
   debug_assert(!info->index_size);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: fix broken 8-wide s3tc decoding

2019-05-06 Thread sroland
From: Roland Scheidegger 

Brian noticed there was an uninitialized var for the 8-wide case and 128
bit blocks, which made it always crash. Likewise, the 64bit block case
had another crash bug due to type mismatch.
Color decode (used for all s3tc formats) also had a bogus shuffle for
this case, leading to decode artifacts.
Fix these all up, which makes the code actually work 8-wide. Note that
it's still not used - I've verified it works, and the generated assembly
does look quite a bit simpler actually (20-30% fewer instructions for the
s3tc decode part with avx2), however in practice it still seems to be
slightly slower for some unknown reason (tested with openarena) on my
haswell box, so for now continue to split things into 4-wide vectors
before decoding.
---
 .../auxiliary/gallivm/lp_bld_format_s3tc.c| 33 +--
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
index 9561c349dad..8f6e9bec18a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
@@ -77,24 +77,17 @@ lp_build_uninterleave2_half(struct gallivm_state *gallivm,
 unsigned lo_hi)
 {
LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
+   unsigned i;
 
assert(type.length <= LP_MAX_VECTOR_LENGTH);
assert(lo_hi < 2);
 
if (type.length * type.width == 256) {
-  assert(type.length >= 4);
-  for (i = 0, j = 0; i < type.length; ++i) {
- if (i == type.length / 4) {
-j = type.length;
- } else if (i == type.length / 2) {
-j = type.length / 2;
- } else if (i == 3 * type.length / 4) {
-j = 3 * type.length / 4;
- } else {
-j += 2;
- }
- elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
+  assert(type.length == 8);
+  assert(type.width == 32);
+  const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
+  for (i = 0; i < type.length; ++i) {
+ elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
   }
} else {
   for (i = 0; i < type.length; ++i) {
@@ -277,7 +270,7 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,
}
else {
   LLVMValueRef tmp[4], cc01, cc23;
-  struct lp_type lp_type32, lp_type64, lp_type32dxt;
+  struct lp_type lp_type32, lp_type64;
   memset(&lp_type32, 0, sizeof lp_type32);
   lp_type32.width = 32;
   lp_type32.length = length;
@@ -309,10 +302,14 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,
   
lp_build_const_extend_shuffle(gallivm, 2, 4), "");
  }
  if (length == 8) {
+struct lp_type lp_type32_4;
+memset(&lp_type32_4, 0, sizeof lp_type32_4);
+lp_type32_4.width = 32;
+lp_type32_4.length = 4;
 for (i = 0; i < 4; ++i) {
tmp[0] = elems[i];
tmp[1] = elems[i+4];
-   elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
+   elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
 }
  }
  cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], 
elems[1], 0);
@@ -811,7 +808,7 @@ s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
bit_pos = LLVMBuildAnd(builder, bit_pos,
   lp_build_const_int_vec(gallivm, type, 0xffdf), 
"");
-   /* Warning: slow shift with per element count */
+   /* Warning: slow shift with per element count (without avx2) */
/*
 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
 * to select the right byte with pshufb. Then for the remaining one bit
@@ -1640,7 +1637,6 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
   lp_build_const_int_vec(gallivm, type16, 8), "");
alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
-   /* XXX this shuffle broken with LLVM 2.8 */
alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
 
@@ -2176,6 +2172,9 @@ lp_build_fetch_s3tc_rgba_aos(struct gallivm_state 
*gallivm,
   return rgba;
}
 
+   /*
+* Could use n > 8 here with avx2, but doesn't seem faster.
+*/
if (n > 4) {
   unsigned count;
   LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: fix saturated signed add / sub with llvm 9

2019-04-16 Thread sroland
From: Roland Scheidegger 

llvm 8 removed saturated unsigned add / sub x86 sse2 intrinsics, and
now llvm 9 removed the signed versions as well - they were proposed for
removal earlier, but the pattern to recognize those was very complex,
so it wasn't done then. However, instead of these arch-specific
intrinsics, there's now arch-independent intrinsics for saturated
add / sub, both for signed and unsigned, so use these.
They should have only advantages (work with arbitrary vector sizes,
optimal code for all archs), although I don't know how well they work
in practice for other archs (at least for x86 they do the right thing).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110454
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 057c50ed278..02fb81afe51 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -555,6 +555,12 @@ lp_build_add(struct lp_build_context *bld,
 return bld->one;
 
   if (!type.floating && !type.fixed) {
+ if (HAVE_LLVM >= 0x0900) {
+char intrin[32];
+intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
+lp_format_intrinsic(intrin, sizeof intrin, intrinsic, 
bld->vec_type);
+return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, 
a, b);
+ }
  if (type.width * type.length == 128) {
 if (util_cpu_caps.has_sse2) {
if (type.width == 8)
@@ -625,6 +631,7 @@ lp_build_add(struct lp_build_context *bld,
   * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
   * interfere with llvm's ability to recognize the pattern but seems
   * a bit brittle.
+  * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
   */
  LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, 
res);
  res = lp_build_select(bld, overflowed,
@@ -876,6 +883,12 @@ lp_build_sub(struct lp_build_context *bld,
 return bld->zero;
 
   if (!type.floating && !type.fixed) {
+ if (HAVE_LLVM >= 0x0900) {
+char intrin[32];
+intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
+lp_format_intrinsic(intrin, sizeof intrin, intrinsic, 
bld->vec_type);
+return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, 
a, b);
+ }
  if (type.width * type.length == 128) {
 if (util_cpu_caps.has_sse2) {
if (type.width == 8)
@@ -925,6 +938,7 @@ lp_build_sub(struct lp_build_context *bld,
   * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
   * interfere with llvm's ability to recognize the pattern but seems
   * a bit brittle.
+  * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
   */
  LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
  a = lp_build_select(bld, no_ov, a, b);
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: fix bogus assert in get_indirect_index

2019-04-15 Thread sroland
From: Roland Scheidegger 

0 is a valid value as max index, and the code handles it fine. This isn't
commonly seen, as it will only happen with array declarations of size 1.
The assert was introduced with a3c898dc97ec5f0e0b93b2ee180bdf8ca3bab14c.

Fixes piglit tests/shaders/complex-loop-analysis-bug.shader_test
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110441
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 0f5b3d9acb7..d6af1d84471 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1108,7 +1108,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
 * larger than the declared size but smaller than the buffer size.
 */
if (reg_file != TGSI_FILE_CONSTANT) {
-  assert(index_limit > 0);
+  assert(index_limit >= 0);
   max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
  uint_bld->type, index_limit);
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] gallivm: abort when trying to use non-existing intrinsic

2018-12-20 Thread sroland
From: Roland Scheidegger 

Whenever llvm removes an intrinsic (we're using), we're hitting segfaults
due to llvm doing calls to address 0 in the jitted code instead.
However, Jose figured out we can actually detect this with
LLVMGetIntrinsicID(), so use this to abort, so we don't have to wonder
what got broken. (Of course, someone still needs to fix the code to
no longer use this intrinsic.)
---
 src/gallium/auxiliary/gallivm/lp_bld_intr.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c 
b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 74ed16f33f0..c9df136b103 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -241,6 +241,16 @@ lp_build_intrinsic(LLVMBuilderRef builder,
 
   function = lp_declare_intrinsic(module, name, ret_type, arg_types, 
num_args);
 
+  /*
+   * If llvm removes an intrinsic we use, we'll hit this abort (rather
+   * than a call to address zero in the jited code).
+   */
+  if (LLVMGetIntrinsicID(function) == 0) {
+ printf("llvm (version 0x%x) found no intrinsic for %s, going to 
crash...\n",
+HAVE_LLVM, name);
+ abort();
+  }
+
   if (!set_callsite_attrs)
  lp_add_func_attributes(function, attr_mask);
 
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: don't use pavg.b intrinsic on llvm >= 6.0

2018-12-20 Thread sroland
From: Roland Scheidegger 

This intrinsic disappeared with llvm 6.0; using it ends up in segfaults
(due to llvm issuing a call to NULL address in the jitted shaders).
Add code doing the same thing as the autoupgrade code in llvm so it
can be matched and replaced back with a pavgb.

While here, also improve lp_test_format, so it tests both with and without
cache (as it was, it tested the cache versions only, whereas cache is
actually disabled in llvmpipe, and in any case even with it enabled
vertex and geometry shaders wouldn't use it). (Although at least for
the unorm8 uncached fetch, the code is still quite different to what
llvmpipe is using, since that would use unorm8x16 type, whereas
the test code is using unorm8x4 type, hence disabling some intrinsic
paths.)

Fixes: 6f4083143bb8c478ccfcaef034d183d89b471993
---
 .../auxiliary/gallivm/lp_bld_format_s3tc.c| 55 +--
 src/gallium/drivers/llvmpipe/lp_test_format.c | 91 ++-
 2 files changed, 95 insertions(+), 51 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
index 2b143566f24..9561c349dad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
@@ -457,6 +457,50 @@ color_expand_565_to_(struct gallivm_state *gallivm,
 }
 
 
+/*
+ * Average two byte vectors. (Will always round up.)
+ */
+static LLVMValueRef
+lp_build_pavgb(struct lp_build_context *bld8,
+   LLVMValueRef v0,
+   LLVMValueRef v1)
+{
+   struct gallivm_state *gallivm = bld8->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   assert(bld8->type.width == 8);
+   assert(bld8->type.length == 16 || bld8->type.length == 32);
+   if (HAVE_LLVM < 0x0600) {
+  LLVMValueRef intrargs[2];
+  char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
+  "llvm.x86.sse2.pavg.b";
+  intrargs[0] = v0;
+  intrargs[1] = v1;
+  return lp_build_intrinsic(builder, intr_name,
+bld8->vec_type, intrargs, 2, 0);
+   } else {
+  /*
+   * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
+   * You better hope the backend code manages to detect the pattern, and
+   * the pattern doesn't change there...
+   */
+  struct lp_type type_ext = bld8->type;
+  LLVMTypeRef vec_type_ext;
+  LLVMValueRef res;
+  LLVMValueRef ext_one;
+  type_ext.width = 16;
+  vec_type_ext = lp_build_vec_type(gallivm, type_ext);
+  ext_one = lp_build_const_vec(gallivm, type_ext, 1);
+
+  v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
+  v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
+  res = LLVMBuildAdd(builder, v0, v1, "");
+  res = LLVMBuildAdd(builder, res, ext_one, "");
+  res = LLVMBuildLShr(builder, res, ext_one, "");
+  res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
+  return res;
+   }
+}
+
 /**
  * Calculate 1/3(v1-v0) + v0
  * and 2*1/3(v1-v0) + v0
@@ -602,13 +646,7 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
*/
   if ((util_cpu_caps.has_sse2 && n == 4) ||
   (util_cpu_caps.has_avx2 && n == 8)) {
- LLVMValueRef intrargs[2];
- char *intr_name = n == 8 ? "llvm.x86.avx2.pavg.b" :
-"llvm.x86.sse2.pavg.b";
- intrargs[0] = colors0;
- intrargs[1] = colors1;
- color2_2 = lp_build_intrinsic(builder, intr_name,
-   bld8.vec_type, intrargs, 2, 0);
+ color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
  color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
   }
   else {
@@ -1278,8 +1316,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
  /* same interleave as for lerp23 - correct result in 2nd element */
  intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 
0);
  intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, 
"");
- color2_2 = lp_build_intrinsic(builder, "llvm.x86.sse2.pavg.b",
-   bld8.vec_type, intrargs, 2, 0);
+ color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
   }
   else {
  LLVMValueRef v01, v0, v1, vhalf;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c 
b/src/gallium/drivers/llvmpipe/lp_test_format.c
index a8aa33d8ae9..885d886cfa9 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -44,8 +44,6 @@
 
 #include "lp_test.h"
 
-#define USE_TEXTURE_CACHE 1
-
 static struct lp_build_format_cache *cache_ptr;
 
 void
@@ -80,7 +78,8 @@ typedef void
 static LLVMValueRef
 add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
 const struct util_format_description *desc,
-struct lp_

[Mesa-dev] [PATCH] gallivm: remove unused float coord wrapping for aos sampling

2018-12-06 Thread sroland
From: Roland Scheidegger 

AoS sampling tries to use integers for coord wrapping when possible,
as it should be faster. However, for AVX, this was suboptimal, because
only floats can use 8x32bit vectors, whereas integers have to be split
into 4x32bit vectors. (I believe part of why it was slower was also
that at least earlier llvm versions had trouble optimizing it properly,
since you can still do simple bit ops with 8x32bit vectors, so a
sequence of int add / and / int add / and with such vectors would
actually end up doing 128bit inserts/extracts between the operations
instead of just doing the cheap 128bit ands.)
Hence, a special float coord wrapping path was added to AoS sampling.
But this path was actually disabled for a long time already, since we
found that just splitting everything before entering the AoS path was
still slightly faster usually, so none of this float coord wrapping
code was used anymore (AoS sampling code, when avx2 isn't supported,
never sees vectors with length > 4). I thought it might be useful some
day again, but I'm not interested anymore in optimizing for very weird
instruction sets which have support for 256bit vectors for floats but
not for ints, so just drop it.
---
 .../auxiliary/gallivm/lp_bld_sample_aos.c | 530 +-
 1 file changed, 23 insertions(+), 507 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index c46749dbac8..ad3a9e4a4ca 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -131,68 +131,6 @@ lp_build_sample_wrap_nearest_int(struct 
lp_build_sample_context *bld,
 }
 
 
-/**
- * Build LLVM code for texture coord wrapping, for nearest filtering,
- * for float texcoords.
- * \param coord  the incoming texcoord (s,t or r)
- * \param length  the texture size along one dimension
- * \param offset  the texel offset along the coord axis
- * \param is_pot  if TRUE, length is a power of two
- * \param wrap_mode  one of PIPE_TEX_WRAP_x
- * \param icoord  the texcoord after wrapping, as int
- */
-static void
-lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
-   LLVMValueRef coord,
-   LLVMValueRef length,
-   LLVMValueRef offset,
-   boolean is_pot,
-   unsigned wrap_mode,
-   LLVMValueRef *icoord)
-{
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   LLVMValueRef length_minus_one;
-
-   switch(wrap_mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-  if (offset) {
- /* this is definitely not ideal for POT case */
- offset = lp_build_int_to_float(coord_bld, offset);
- offset = lp_build_div(coord_bld, offset, length);
- coord = lp_build_add(coord_bld, coord, offset);
-  }
-  /* take fraction, unnormalize */
-  coord = lp_build_fract_safe(coord_bld, coord);
-  coord = lp_build_mul(coord_bld, coord, length);
-  *icoord = lp_build_itrunc(coord_bld, coord);
-  break;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-  length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
-  if (bld->static_sampler_state->normalized_coords) {
- /* scale coord to length */
- coord = lp_build_mul(coord_bld, coord, length);
-  }
-  if (offset) {
- offset = lp_build_int_to_float(coord_bld, offset);
- coord = lp_build_add(coord_bld, coord, offset);
-  }
-  coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
- length_minus_one);
-  *icoord = lp_build_itrunc(coord_bld, coord);
-  break;
-
-   case PIPE_TEX_WRAP_CLAMP:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-   default:
-  assert(0);
-   }
-}
-
-
 /**
  * Helper to compute the first coord and the weight for
  * linear wrap repeat npot textures
@@ -424,129 +362,6 @@ lp_build_sample_wrap_linear_int(struct 
lp_build_sample_context *bld,
 }
 
 
-/**
- * Build LLVM code for texture coord wrapping, for linear filtering,
- * for float texcoords.
- * \param block_length  is the length of the pixel block along the
- *  coordinate axis
- * \param coord  the incoming texcoord (s,t or r)
- * \param length  the texture size along one dimension
- * \param offset  the texel offset along the coord axis
- * \param is_pot  if TRUE, length is a power of two
- * \param wrap_mode  one of PIPE_TEX_WRAP_x
- * \param coord0  the first texcoord after wrapping, as int
- * \param coord1  the second texcoord after wrapping, as int
- * \param weight  the filter weight as int (0-255)
- * \param force_nearest  if this coord actually uses nearest filtering
- */
-

[Mesa-dev] [PATCH] draw: fix infinite loop in line stippling

2018-11-22 Thread sroland
From: Roland Scheidegger 

The calculated length of a line may be infinite, if the coords we
get are bogus. This leads to an infinite loop in line stippling.
To prevent this, test for this explicitly (although technically
on at least x86 sse it would actually work without the explicit
test, as long as we use the int-converted length value).
While here also get rid of some always-true condition.

Note this does not actually solve the root cause, which is that
the coords we receive are bogus after clipping. This seems a difficult
problem to solve. One issue is that due to float arithmetic, clip w
may become 0 after clipping if the incoming geometry is
"sufficiently degenerate", hence x/y/z ndc (and window) coords will
be all inf (or nan). Even with w not quite 0, I believe it's possible
we produce values which are actually outside the view volume.
(Also, x=y=z=w=0 coords in clipspace would be not considered subject
to clipping, and similarly result in all NaN coords.) We just hope for
now other draw stages (and rasterizers) can handle those relatively
safely (llvmpipe itself should be sort of robust against this, certainly
conversion to fixed point will produce garbage, it might fail a couple
assertions but should neither hang nor crash otherwise).
---
 .../auxiliary/draw/draw_pipe_stipple.c| 26 +++
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c 
b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index d30572cc61..386b7649e4 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -48,8 +48,8 @@
 struct stipple_stage {
struct draw_stage stage;
float counter;
-   uint pattern;
-   uint factor;
+   ushort pattern;
+   ushort factor;
bool smooth;
 };
 
@@ -110,7 +110,7 @@ emit_segment(struct draw_stage *stage, struct prim_header 
*header,
 
 
 static inline bool
-stipple_test(int counter, ushort pattern, int factor)
+stipple_test(int counter, ushort pattern, ushort factor)
 {
int b = (counter / factor) & 0xf;
return !!((1 << b) & pattern);
@@ -136,6 +136,10 @@ stipple_line(struct draw_stage *stage, struct prim_header 
*header)
 
float length;
int i;
+   int intlength;
+
+   if (header->flags & DRAW_PIPE_RESET_STIPPLE)
+  stipple->counter = 0;
 
if (stipple->smooth) {
   float dx = x1 - x0;
@@ -147,21 +151,21 @@ stipple_line(struct draw_stage *stage, struct prim_header 
*header)
   length = MAX2(dx, dy);
}
 
-   if (header->flags & DRAW_PIPE_RESET_STIPPLE)
-  stipple->counter = 0;
+   if (util_is_inf_or_nan(length))
+  intlength = 0;
+   else
+  intlength = ceilf(length);
 
/* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table.
 */
-   for (i = 0; i < length; i++) {
+   for (i = 0; i < intlength; i++) {
   bool result = stipple_test((int)stipple->counter + i,
- (ushort)stipple->pattern, stipple->factor);
+ stipple->pattern, stipple->factor);
   if (result != state) {
  /* changing from "off" to "on" or vice versa */
  if (state) {
-if (start != i) {
-   /* finishing an "on" segment */
-   emit_segment(stage, header, start / length, i / length);
-}
+/* finishing an "on" segment */
+emit_segment(stage, header, start / length, i / length);
  }
  else {
 /* starting an "on" segment */
-- 
2.17.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix improper clamping of vertex index when fetching gs inputs

2018-11-07 Thread sroland
From: Roland Scheidegger 

Because we only have one file_max for the (2d) gs input file, the value
actually represents the max of attrib and vertex index (although I'm
not entirely sure if we really want the max, since the max valid value
of the vertex dimension can be easily deduced from the input primitive).

Thus in cases where the number of inputs is higher than the number of
vertices per prim, we did not properly clamp the vertex index, which
would result in out-of-bound fetches, potentially causing segfaults
(the segfaults seemed actually difficult to trigger, but valgrind
certainly wasn't happy). This might have happened even if the shader
did not actually try to fetch bogus vertices, if the fetching happened
in non-active conditional clauses.

To fix simply use the correct max vertex index value (derived from
the input prim type) instead when clamping for this case.
---
 .../auxiliary/gallivm/lp_bld_tgsi_soa.c   | 38 ++-
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 83d7dbea9a..0db81b31ad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -41,6 +41,7 @@
 #include "util/u_debug.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_info.h"
@@ -1059,7 +1060,8 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld,
 static LLVMValueRef
 get_indirect_index(struct lp_build_tgsi_soa_context *bld,
unsigned reg_file, unsigned reg_index,
-   const struct tgsi_ind_register *indirect_reg)
+   const struct tgsi_ind_register *indirect_reg,
+   unsigned index_limit)
 {
LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
struct lp_build_context *uint_bld = &bld->bld_base.uint_bld;
@@ -1107,8 +1109,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld,
 */
if (reg_file != TGSI_FILE_CONSTANT) {
   max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm,
- uint_bld->type,
- 
bld->bld_base.info->file_max[reg_file]);
+ uint_bld->type, index_limit);
 
   assert(!uint_bld->type.sign);
   index = lp_build_min(uint_bld, index, max_index);
@@ -1224,7 +1225,8 @@ emit_fetch_constant(
   indirect_index = get_indirect_index(bld,
   reg->Register.File,
   reg->Register.Index,
-  &reg->Indirect);
+  &reg->Indirect,
+  
bld->bld_base.info->file_max[reg->Register.File]);
 
   /* All fetches are from the same constant buffer, so
* we need to propagate the size to a vector to do a
@@ -1341,7 +1343,8 @@ emit_fetch_immediate(
  indirect_index = get_indirect_index(bld,
  reg->Register.File,
  reg->Register.Index,
- &reg->Indirect);
+ &reg->Indirect,
+ 
bld->bld_base.info->file_max[reg->Register.File]);
  /*
   * Unlike for other reg classes, adding pixel offsets is unnecessary -
   * immediates are stored as full vectors (FIXME??? - might be better
@@ -1414,7 +1417,8 @@ emit_fetch_input(
   indirect_index = get_indirect_index(bld,
   reg->Register.File,
   reg->Register.Index,
-  &reg->Indirect);
+  &reg->Indirect,
+  
bld->bld_base.info->file_max[reg->Register.File]);
 
   index_vec = get_soa_array_offsets(&bld_base->uint_bld,
 indirect_index,
@@ -1502,7 +1506,15 @@ emit_fetch_gs_input(
   attrib_index = get_indirect_index(bld,
 reg->Register.File,
 reg->Register.Index,
-&reg->Indirect);
+&reg->Indirect,
+   /*
+* XXX: this is possibly not quite the right value, since file_max may be
+* larger than the max attrib index, due to it being the max of declared
+* inputs AND the max vertices per prim (which is 6 for tri adj).
+* It should however be safe to use (since we always allocate
+* PIPE_MAX_SHADER_INPUTS (80) for it, which is overallocated quite a bit).
+*/
+info->file_max[reg->R

[Mesa-dev] [PATCH] gallivm: don't use saturated unsigned add/sub intrinsics for llvm 8.0

2018-08-23 Thread sroland
From: Roland Scheidegger 

These have been removed. Unfortunately auto-upgrade doesn't work for
jit. (Worse, it seems we don't get a compilation error anymore when
compiling the shader, rather llvm will just do a call to a null
function in the jitted shaders making it difficult to detect when
intrinsics vanish.)

Luckily the signed ones are still there; I helped convince llvm that
removing them is a bad idea for now, since while the unsigned ones have
sort of agreed-upon simplest patterns to replace them with, this is not
the case for the signed ones, and they require _significantly_ more
complex patterns - to the point that the recognition is IMHO probably
unlikely to ever work reliably in practice (due to other optimizations
interfering). (Even for the relatively trivial unsigned patterns, llvm
already added test cases where recognition doesn't work, unsaturated
add followed by saturated add may produce atrocious code.)
Nevertheless, it seems there's a serious quest to squash all
cpu-specific intrinsics going on, so I'd expect patches to nuke them as
well to resurface.

Adapt the existing fallback code to match the simple patterns llvm uses
and hope for the best. I've verified with lp_test_blend that it does
produce the expected saturated assembly instructions. Though our
cmp/select build helpers don't use boolean masks, but it doesn't seem
to interfere with llvm's ability to recognize the pattern.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106231
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 87 ++---
 1 file changed, 60 insertions(+), 27 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index e922474ef61..f348833206b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -557,23 +557,27 @@ lp_build_add(struct lp_build_context *bld,
   if (!type.floating && !type.fixed) {
  if (type.width * type.length == 128) {
 if (util_cpu_caps.has_sse2) {
-  if (type.width == 8)
-intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : 
"llvm.x86.sse2.paddus.b";
-  if (type.width == 16)
-intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : 
"llvm.x86.sse2.paddus.w";
+   if (type.width == 8)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
+ HAVE_LLVM < 0x0800 ? 
"llvm.x86.sse2.paddus.b" : NULL;
+   if (type.width == 16)
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
+ HAVE_LLVM < 0x0800 ? 
"llvm.x86.sse2.paddus.w" : NULL;
 } else if (util_cpu_caps.has_altivec) {
-  if (type.width == 8)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : 
"llvm.ppc.altivec.vaddubs";
-  if (type.width == 16)
- intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : 
"llvm.ppc.altivec.vadduhs";
+   if (type.width == 8)
+  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : 
"llvm.ppc.altivec.vaddubs";
+   if (type.width == 16)
+  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : 
"llvm.ppc.altivec.vadduhs";
 }
  }
  if (type.width * type.length == 256) {
 if (util_cpu_caps.has_avx2) {
-  if (type.width == 8)
-intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : 
"llvm.x86.avx2.paddus.b";
-  if (type.width == 16)
-intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : 
"llvm.x86.avx2.paddus.w";
+   if (type.width == 8)
+  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
+  HAVE_LLVM < 0x0800 ? 
"llvm.x86.avx2.paddus.b" : NULL;
+   if (type.width == 16)
+  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
+  HAVE_LLVM < 0x0800 ? 
"llvm.x86.avx2.paddus.w" : NULL;
 }
  }
   }
@@ -592,8 +596,6 @@ lp_build_add(struct lp_build_context *bld,
  LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, 
LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
  LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, 
LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
  a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, 
bld->zero), a_clamp_max, a_clamp_min);
-  } else {
- a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), 
GALLIVM_NAN_BEHAVIOR_UNDEFINED);
   }
}
 
@@ -612,6 +614,24 @@ lp_build_add(struct lp_build_context *bld,
if(bld->type.norm && (bld->type.floating || bld->type.fixed))
   res = lp_build_min_simple(bld, res, bld->one, 
GALLIVM_NAN_BEHAVIOR_UNDEFINED);
 
+   if (type.norm && !type.floating && !type.

[Mesa-dev] [PATCH] util: return 0 for NaNs in float_to_ubyte

2018-08-02 Thread sroland
From: Roland Scheidegger 

d3d10 requires NaNs to get converted to 0 for float->unorm conversions
(and float->int etc.). GL spec probably doesn't care in general, but it
would make sense to have reasonable behavior in any case imho - the
old code was converting negative NaNs to 0, and positive NaNs to 255.
(Note that using float comparison isn't actually all that much more
effort in any case, at least with sse2 it's just float comparison
(ucommiss) instead of int one - I converted the second comparison
to float too simply because it saves the probably somewhat expensive
transfer of the float from simd to int domain (with sse2 via stack),
so the generated code actually has 2 less instructions, although float
comparisons are more expensive than int ones.)
---
 src/gallium/auxiliary/util/u_math.h | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_math.h 
b/src/gallium/auxiliary/util/u_math.h
index 79869a1..712305c 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -360,7 +360,6 @@ uif(uint32_t ui)
 
 /**
  * Convert ubyte to float in [0, 1].
- * XXX a 256-entry lookup table would be slightly faster.
  */
 static inline float
 ubyte_to_float(ubyte ub)
@@ -375,16 +374,16 @@ ubyte_to_float(ubyte ub)
 static inline ubyte
 float_to_ubyte(float f)
 {
-   union fi tmp;
-
-   tmp.f = f;
-   if (tmp.i < 0) {
+   /* return 0 for NaN too */
+   if (!(f > 0.0f)) {
   return (ubyte) 0;
}
-   else if (tmp.i >= 0x3f800000 /* 1.0f */) {
+   else if (f >= 1.0f) {
   return (ubyte) 255;
}
else {
+  union fi tmp;
+  tmp.f = f;
   tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
   return (ubyte) tmp.i;
}
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: force draw pipeline if there's more than 65535 vertices

2018-07-21 Thread sroland
From: Roland Scheidegger 

The pt emit path can only handle 65535 - the number of vertices is
truncated to a ushort, resulting in a too small buffer allocation, which
will crash.

Forcing the pipeline path looks suboptimal, then again this bug is
probably there ever since GS is supported, so it seems it's not
happening often. (Note that the vertex_id in the vertex header is 16
bit too, however this is only used by the draw pipeline, and it denotes
the emit vertex nr, and that uses vbuf code, which will only emit smaller
chunks, so should be fine I think.)
Other solutions would be to simply allow 32bit counts for vertex
allocation, however 65535 is already larger than this was intended for
(the idea being it should be more cache friendly). Or could try to teach
the pt emit path to split the emit in smaller chunks (only the non-index
path can be affected, since gs output is always linear), but it's a bit
tricky (we don't know the primitive boundaries up-front).

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=107295

Cc: 
---
 src/gallium/auxiliary/draw/draw_pt_emit.c  |  2 ++
 src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c  | 10 ++
 src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c |  9 +
 3 files changed, 21 insertions(+)

diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c 
b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 6fb630b549..984c76fdf9 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -158,6 +158,7 @@ draw_pt_emit(struct pt_emit *emit,
 */
render->set_primitive(draw->render, prim_info->prim);
 
+   assert(vertex_count <= 65535);
render->allocate_vertices(render,
  (ushort)translate->key.output_stride,
  (ushort)vertex_count);
@@ -229,6 +230,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
 */
render->set_primitive(draw->render, prim_info->prim);
 
+   assert(count <= 65535);
if (!render->allocate_vertices(render,
   (ushort)translate->key.output_stride,
   (ushort)count))
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c 
b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index aa20b918f5..f76e022994 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -299,6 +299,16 @@ fetch_pipeline_generic(struct draw_pt_middle_end *middle,
   FREE(vert_info->verts);
   vert_info = &gs_vert_info;
   prim_info = &gs_prim_info;
+
+  /*
+   * pt emit can only handle ushort number of vertices (see
+   * render->allocate_vertices).
+   * vsplit guarantees there's never more than 4096, however GS can
+   * easily blow this up (by a factor of 256 (or even 1024) max).
+   */
+  if (vert_info->count > 65535) {
+ opt |= PT_PIPELINE;
+  }
} else {
   if (draw_prim_assembler_is_required(draw, prim_info, vert_info)) {
  draw_prim_assembler_run(draw, prim_info, vert_info,
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c 
b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 5e0c562256..91c9360cce 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -428,6 +428,15 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle,
   FREE(vert_info->verts);
   vert_info = &gs_vert_info;
   prim_info = &gs_prim_info;
+  /*
+   * pt emit can only handle ushort number of vertices (see
+   * render->allocate_vertices).
+   * vsplit guarantees there's never more than 4096, however GS can
+   * easily blow this up (by a factor of 256 (or even 1024) max).
+   */
+  if (vert_info->count > 65535) {
+ opt |= PT_PIPELINE;
+  }
} else {
   if (draw_prim_assembler_is_required(draw, prim_info, vert_info)) {
  draw_prim_assembler_run(draw, prim_info, vert_info,
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nir: fix msvc build

2018-07-13 Thread sroland
From: Roland Scheidegger 

Empty initializer braces aren't valid c (it's a gnu extension, and
it's valid in c++).
Hopefully fixes appveyor / msvc build...

Fixes a3150c1d06ae7766c3d3fe3b33432e55c3c7527e
---
 src/compiler/nir/nir_format_convert.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_format_convert.h 
b/src/compiler/nir/nir_format_convert.h
index 33d90f260c..45532b7488 100644
--- a/src/compiler/nir/nir_format_convert.h
+++ b/src/compiler/nir/nir_format_convert.h
@@ -121,7 +121,7 @@ nir_format_bitcast_uint_vec_unmasked(nir_builder *b, 
nir_ssa_def *src,
   DIV_ROUND_UP(src->num_components * src_bits, dst_bits);
assert(dst_components <= 4);
 
-   nir_ssa_def *dst_chan[4] = { };
+   nir_ssa_def *dst_chan[4] = {0};
if (dst_bits > src_bits) {
   unsigned shift = 0;
   unsigned dst_idx = 0;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600/sb: fix crash in fold_alu_op3

2018-07-03 Thread sroland
From: Roland Scheidegger 

fold_assoc() called from fold_alu_op3() can lower the number of src to 2,
which then leads to an invalid access to n.src[2]->gvalue().
This didn't seem to have caused much harm in the past, but on Fedora 28
it will crash (presumably because -D_GLIBCXX_ASSERTIONS is used, although
with libstdc++ 4.8.5 this didn't do anything, -D_GLIBCXX_DEBUG was
needed to show the issue).

An alternative fix would be to instead call fold_alu_op2() from within
fold_assoc() when the number of src is reduced and return always TRUE
from fold_assoc() in this case, with the only actual difference being
the return value from fold_alu_op3() then. I'm not sure what the return
value actually should be in this case (or whether it even can make a
difference).

https://bugs.freedesktop.org/show_bug.cgi?id=106928
Cc: mesa-sta...@lists.freedesktop.org
---
 src/gallium/drivers/r600/sb/sb_expr.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp 
b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 1df78da660..ad798453bc 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -945,6 +945,8 @@ bool expr_handler::fold_alu_op3(alu_node& n) {
if (!sh.safe_math && (n.bc.op_ptr->flags & AF_M_ASSOC)) {
if (fold_assoc(&n))
return true;
+   if (n.src.size() < 3)
+   return fold_alu_op2(n);
}
 
value* v0 = n.src[0]->gvalue();
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] nir/linker: fix msvc build

2018-07-03 Thread sroland
From: Roland Scheidegger 

Empty initializer braces aren't valid c (it's a gnu extension, and
it's valid in c++).
Hopefully fixes appveyor / msvc build...

Fixes 6677e131b806b10754adcb7cf3f427a7fcc2aa09
---
 src/compiler/glsl/gl_nir_link_atomics.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/glsl/gl_nir_link_atomics.c 
b/src/compiler/glsl/gl_nir_link_atomics.c
index da6f5107c9..887ac1b9d0 100644
--- a/src/compiler/glsl/gl_nir_link_atomics.c
+++ b/src/compiler/glsl/gl_nir_link_atomics.c
@@ -175,7 +175,7 @@ gl_nir_link_assign_atomic_counter_resources(struct 
gl_context *ctx,
 struct gl_shader_program *prog)
 {
unsigned num_buffers;
-   unsigned num_atomic_buffers[MESA_SHADER_STAGES] = { };
+   unsigned num_atomic_buffers[MESA_SHADER_STAGES] = {0};
struct active_atomic_buffer *abs =
   find_active_atomic_counters(ctx, prog, &num_buffers);
 
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600: fix copy/paste bug for sampleMaskIn workaround

2018-06-15 Thread sroland
From: Roland Scheidegger 

The sampleMaskIn workaround (b936f4d1ca0d2ab1e828ff6a6e617f12469687fa)
tries to figure out if the shader is running at per-sample frequency, but
there's a typo bug so it will only recognize per-sample linear inputs,
not per-sample perspective ones.

Spotted by Eric Engestrom 
---
 src/gallium/drivers/r600/r600_shader.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index c9f2fa6485..c466a48262 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1247,7 +1247,7 @@ static int allocate_system_value_inputs(struct 
r600_shader_ctx *ctx, int gpr_off
tgsi_parse_free(&parse);
 
if (ctx->info.reads_samplemask &&
-   (ctx->info.uses_linear_sample || ctx->info.uses_linear_sample)) {
+   (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
inputs[1].enabled = true;
}
 
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: improve rasterization discard logic

2018-05-21 Thread sroland
From: Roland Scheidegger 

This unifies the explicit rasterization discard as well as the implicit
rasterization disabled logic (which we need for another state tracker),
which really should do the exact same thing.
We'll now toss out the prims early on in setup with (implicit or
explicit) discard, rather than do setup and binning with them, which
was entirely pointless.
(We should eventually get rid of implicit discard, which should also
enable us to discard stuff already in draw, hence draw would be
able to skip the pointless clip and fallback stages in this case.)
We still need separate logic for only null ps - this is not the same
as rasterization discard. But simplify the logic there and don't count
primitives simply when there's an empty fs, regardless of depth/stencil
tests, which seems perfectly acceptable by d3d10.
While here, also fix statistics for primitives if face culling is
enabled.
No piglit changes.
---
 src/gallium/drivers/llvmpipe/lp_context.h   |  1 -
 src/gallium/drivers/llvmpipe/lp_jit.c   |  1 +
 src/gallium/drivers/llvmpipe/lp_jit.h   |  5 +++
 src/gallium/drivers/llvmpipe/lp_rast.c  | 12 +++-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h |  6 
 src/gallium/drivers/llvmpipe/lp_scene.c |  5 ++-
 src/gallium/drivers/llvmpipe/lp_scene.h | 10 +++---
 src/gallium/drivers/llvmpipe/lp_setup.c | 18 ++-
 src/gallium/drivers/llvmpipe/lp_setup_line.c| 27 ++--
 src/gallium/drivers/llvmpipe/lp_setup_point.c   | 21 +
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 29 -
 src/gallium/drivers/llvmpipe/lp_setup_vbuf.c|  2 +-
 src/gallium/drivers/llvmpipe/lp_state_derived.c | 22 ++---
 src/gallium/drivers/llvmpipe/lp_state_fs.c  | 41 -
 src/gallium/drivers/llvmpipe/lp_state_fs.h  |  5 ---
 15 files changed, 118 insertions(+), 87 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_context.h 
b/src/gallium/drivers/llvmpipe/lp_context.h
index 54d98fd..7a2f253 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -136,7 +136,6 @@ struct llvmpipe_context {
struct blitter_context *blitter;
 
unsigned tex_timestamp;
-   boolean no_rast;
 
/** List of all fragment shader variants */
struct lp_fs_variant_list_item fs_variants_list;
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c 
b/src/gallium/drivers/llvmpipe/lp_jit.c
index a2762f3..e2309f4 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -212,6 +212,7 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
   elem_types[LP_JIT_THREAD_DATA_CACHE] =
 LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
   elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
+  elem_types[LP_JIT_THREAD_DATA_INVOCATIONS] = LLVMInt64TypeInContext(lc);
   elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] =
 LLVMInt32TypeInContext(lc);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h 
b/src/gallium/drivers/llvmpipe/lp_jit.h
index 9db26f2..312d1a1 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -192,6 +192,7 @@ struct lp_jit_thread_data
 {
struct lp_build_format_cache *cache;
uint64_t vis_counter;
+   uint64_t ps_invocations;
 
/*
 * Non-interpolated rasterizer state passed through to the fragment shader.
@@ -205,6 +206,7 @@ struct lp_jit_thread_data
 enum {
LP_JIT_THREAD_DATA_CACHE = 0,
LP_JIT_THREAD_DATA_COUNTER,
+   LP_JIT_THREAD_DATA_INVOCATIONS,
LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX,
LP_JIT_THREAD_DATA_COUNT
 };
@@ -216,6 +218,9 @@ enum {
 #define lp_jit_thread_data_counter(_gallivm, _ptr) \
lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, 
"counter")
 
+#define lp_jit_thread_data_invocations(_gallivm, _ptr) \
+   lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_INVOCATIONS, 
"invocs")
+
 #define lp_jit_thread_data_raster_state_viewport_index(_gallivm, _ptr) \
lp_build_struct_get(_gallivm, _ptr, \
LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, \
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c 
b/src/gallium/drivers/llvmpipe/lp_rast.c
index 939944a..9d4f9f8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -107,7 +107,7 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
 task->scene->fb.height - y * TILE_SIZE : TILE_SIZE;
 
task->thread_data.vis_counter = 0;
-   task->ps_invocations = 0;
+   task->thread_data.ps_invocations = 0;
 
for (i = 0; i < task->scene->fb.nr_cbufs; i++) {
   if (task->scene->fb.cbufs[i]) {
@@ -446,10 +446,6 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
 * allocated 4x4 blocks hence need to filter them out here.
 */
if ((x % TILE_S

[Mesa-dev] [PATCH] draw: get rid of special logic to not emit null tris

2018-05-17 Thread sroland
From: Roland Scheidegger 

I've confirmed after 77554d220d6d74b4d913dc37ea3a874e9dc550e4 we no
longer need this to pass some tests from another api (as we no longer
generate the bogus extra null tris in the first place).
---
 src/gallium/auxiliary/draw/draw_pipe_clip.c | 38 -
 1 file changed, 38 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c 
b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 46118b6..2a9c944 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -210,30 +210,6 @@ static void interp(const struct clip_stage *clip,
 }
 
 /**
- * Checks whether the specified triangle is empty and if it is returns
- * true, otherwise returns false.
- * Triangle is considered null/empty if its area is equal to zero.
- */
-static inline boolean
-is_tri_null(const struct clip_stage *clip, const struct prim_header *header)
-{
-   const unsigned pos_attr = clip->pos_attr;
-   float x1 = header->v[1]->data[pos_attr][0] - 
header->v[0]->data[pos_attr][0];
-   float y1 = header->v[1]->data[pos_attr][1] - 
header->v[0]->data[pos_attr][1];
-   float z1 = header->v[1]->data[pos_attr][2] - 
header->v[0]->data[pos_attr][2];
-
-   float x2 = header->v[2]->data[pos_attr][0] - 
header->v[0]->data[pos_attr][0];
-   float y2 = header->v[2]->data[pos_attr][1] - 
header->v[0]->data[pos_attr][1];
-   float z2 = header->v[2]->data[pos_attr][2] - 
header->v[0]->data[pos_attr][2];
-
-   float vx = y1 * z2 - z1 * y2;
-   float vy = x1 * z2 - z1 * x2;
-   float vz = x1 * y2 - y1 * x2;
-
-   return (vx*vx  + vy*vy + vz*vz) == 0.f;
-}
-
-/**
  * Emit a post-clip polygon to the next pipeline stage.  The polygon
  * will be convex and the provoking vertex will always be vertex[0].
  */
@@ -247,7 +223,6 @@ static void emit_poly(struct draw_stage *stage,
struct prim_header header;
unsigned i;
ushort edge_first, edge_middle, edge_last;
-   boolean tri_emitted = FALSE;
 
if (stage->draw->rasterizer->flatshade_first) {
   edge_first  = DRAW_PIPE_EDGE_FLAG_0;
@@ -269,7 +244,6 @@ static void emit_poly(struct draw_stage *stage,
header.pad = 0;
 
for (i = 2; i < n; i++, header.flags = edge_middle) {
-  boolean tri_null;
   /* order the triangle verts to respect the provoking vertex mode */
   if (stage->draw->rasterizer->flatshade_first) {
  header.v[0] = inlist[0];  /* the provoking vertex */
@@ -282,18 +256,6 @@ static void emit_poly(struct draw_stage *stage,
  header.v[2] = inlist[0];  /* the provoking vertex */
   }
 
-  tri_null = is_tri_null(clipper, &header);
-  /*
-   * If we ever generated a tri (regardless if it had area or not),
-   * skip all subsequent null tris.
-   * FIXME: I think this logic was hiding bugs elsewhere. It should
-   * be possible now to always emit all tris.
-   */
-  if (tri_null && tri_emitted) {
- continue;
-  }
-  tri_emitted = TRUE;
-
   if (!edgeflags[i-1]) {
  header.flags &= ~edge_middle;
   }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: Use alloca_undef with array type instead of alloca_array

2018-05-14 Thread sroland
From: Roland Scheidegger 

Use a single allocation of array type instead of the old-style array
allocation for the temp and immediate arrays.
Probably only makes a difference if they aren't used indirectly (so,
if we used them solely because there's too many temps or immediates).
In this case the sroa and early-cse passes can sometimes do some
optimizations which they otherwise cannot.
(As a side note, for the temp reg array, we actually really should
use one allocation per array id, not just one for everything.)
Note that the instcombine pass would actually promote such
allocations to single alloc of array type as well, but it's too late
for some artificial shaders we've seen to help (we don't want to run
instcombine at the beginning due to its cost, hence would need
another sroa/cse pass after instcombine). sroa/early-cse help there
because they can actually eliminate all of the huge shader, reducing
it to a single const output (don't ask...).
(Interestingly, instcombine also removes all the bitcasts we do on that
allocation for single-value gathering, and in the end directly indexes
into the single vector elements, which according to spec is only
semi-valid, but this happens regardless. Another thing instcombine also
does is use inbound GEPs, which is probably something we should do
manually as well - for indirectly indexed reg files llvm may not be
able to figure it out on its own, but we should be able to guarantee
all pointers are always inbound. In any case, by the looks of it
using single allocation with array type seems to be the right thing
to do even for ordinary shaders.)
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 61 +
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index e411f90..83d7dbe 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -741,7 +741,8 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask,
 
assert(lp_check_value(bld_store->type, val));
assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
-   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
+   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val) ||
+  LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(dst_ptr))) == 
LLVMArrayTypeKind);
 
if (exec_mask) {
   LLVMValueRef res, dst;
@@ -852,7 +853,14 @@ get_file_ptr(struct lp_build_tgsi_soa_context *bld,
 
if (bld->indirect_files & (1 << file)) {
   LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, 
index * 4 + chan);
-  return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+  if (LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(var_of_array))) == 
LLVMArrayTypeKind) {
+ LLVMValueRef gep[2];
+ gep[0] = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
+ gep[1] = lindex;
+ return LLVMBuildGEP(builder, var_of_array, gep, 2, "");
+  } else {
+ return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+  }
}
else {
   assert(index <= bld->bld_base.info->file_max[file]);
@@ -1352,21 +1360,20 @@ emit_fetch_immediate(
  /* Gather values from the immediate register array */
  res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
   } else {
- LLVMValueRef lindex = lp_build_const_int32(gallivm,
-reg->Register.Index * 4 + swizzle);
- LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
-bld->imms_array, &lindex, 1, 
"");
+ LLVMValueRef gep[2];
+ gep[0] = lp_build_const_int32(gallivm, 0);
+ gep[1] = lp_build_const_int32(gallivm, reg->Register.Index * 4 + 
swizzle);
+ LLVMValueRef imms_ptr = LLVMBuildGEP(builder,
+  bld->imms_array, gep, 2, "");
  res = LLVMBuildLoad(builder, imms_ptr, "");
 
  if (tgsi_type_is_64bit(stype)) {
-LLVMValueRef lindex1;
 LLVMValueRef imms_ptr2;
 LLVMValueRef res2;
-
-lindex1 = lp_build_const_int32(gallivm,
-   reg->Register.Index * 4 + swizzle + 
1);
+gep[1] = lp_build_const_int32(gallivm,
+  reg->Register.Index * 4 + swizzle + 
1);
 imms_ptr2 = LLVMBuildGEP(builder,
-  bld->imms_array, &lindex1, 1, "");
+ bld->imms_array, gep, 2, "");
 res2 = LLVMBuildLoad(builder, imms_ptr2, "");
 res = emit_fetch_64bit(bld_base, stype, res, res2);
  }
@@ -2957,13 +2964,14 @@ void lp_emit_immediate_soa(
   unsigned index = bld->num_immediates;
   struct gallivm_state *gallivm = bld->bld_base.base.gallivm;

[Mesa-dev] [PATCH] llvmpipe: Fix random number generation for unit tests

2018-05-07 Thread sroland
From: Roland Scheidegger 

We were never producing negative numbers for signed types.
Also fix only producing half the valid range for uint32, and
properly clamp signed values.

Because this now also properly tests snorm with actually negative
values, need to increase eps for such conversions. I believe these
cannot actually be hit in ordinary operation (e.g. if a snorm texture
is sampled and output to snorm RT, it will still go through snorm->float
and float->snorm conversion), so don't bother to do anything to fix
the bad accuracy (might be quite complex).
Basically, the issue is for something like snorm16->snorm8 that in the
end this will just use a 8 bit arithmetic right shift.
But the math behind it says we should actually do a division by 32767 / 127,
which is ~258, not 256. So the result can be one bit off (values have too large
a magnitude), and furthermore, the shift has incorrect rounding (always rounds
down). For positive numbers, these errors have different directions, but
for negative ones they have the same direction, hence for some values the
error will be 2 bits in the end.

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=106232
---
 src/gallium/drivers/llvmpipe/lp_test_conv.c |  8 
 src/gallium/drivers/llvmpipe/lp_test_main.c | 13 +++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c 
b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 6e58a03..a4f313a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -211,6 +211,14 @@ test_one(unsigned verbose,
assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 
eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));
+   if (dst_type.norm && dst_type.sign && src_type.sign && !src_type.floating) {
+  /*
+   * This is quite inaccurate due to shift being used.
+   * I don't think it's possible to hit such conversions with
+   * llvmpipe though.
+   */
+  eps *= 2;
+   }
 
context = LLVMContextCreate();
gallivm = gallivm_create("test_module", context);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c 
b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 518ca27..5ec0dd3 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -147,6 +147,7 @@ write_elem(struct lp_type type, void *dst, unsigned index, 
double value)
   if(type.sign) {
  long long lvalue = (long long)value;
  lvalue = MIN2(lvalue, ((long long)1 << (type.width - 1)) - 1);
+ lvalue = MAX2(lvalue, -((long long)1 << (type.width - 1)));
  switch(type.width) {
  case 8:
 *((int8_t *)dst + index) = (int8_t)lvalue;
@@ -200,16 +201,24 @@ random_elem(struct lp_type type, void *dst, unsigned 
index)
   }
   else {
  unsigned long long mask;
-if (type.fixed)
+ if (type.fixed)
 mask = ((unsigned long long)1 << (type.width / 2)) - 1;
  else if (type.sign)
 mask = ((unsigned long long)1 << (type.width - 1)) - 1;
  else
 mask = ((unsigned long long)1 << type.width) - 1;
  value += (double)(mask & rand());
+ if (!type.fixed && !type.sign && type.width == 32) {
+/*
+ * rand only returns half the possible range
+ * XXX 64bit values...
+ */
+if(rand() & 1)
+   value += (double)0x8000;
+ }
   }
}
-   if(!type.sign)
+   if(type.sign)
   if(rand() & 1)
  value = -value;
write_elem(type, dst, index, value);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] draw: fix different sign logic when clipping

2018-04-24 Thread sroland
From: Roland Scheidegger 

The logic was flawed, since mul(x,y) will be <= 0 (exactly 0) when
the sign is the same but both numbers are sufficiently small
(if the product is smaller than 2^-128).
This could apparently lead to emitting a sufficient amount of
additional bogus vertices to overflow the allocated array for them,
hitting an assertion (still safe with release builds since we just
aborted clipping after the assertion in this case - I'm however unsure
if this is now really no longer possible, so that code stays).
Not sure if the additional vertices could cause other grief, I didn't
see anything wrong even when hitting the assertion.

Essentially, both +-0 are treated as positive (the vertex is considered
to be inside the clip volume for this plane), so integrate the logic
determining different sign into the branch there.
---
 src/gallium/auxiliary/draw/draw_pipe_clip.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c 
b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index b7a1b5c..6af5c09 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -47,11 +47,6 @@
 /** Set to 1 to enable printing of coords before/after clipping */
 #define DEBUG_CLIP 0
 
-
-#ifndef DIFFERENT_SIGNS
-#define DIFFERENT_SIGNS(x, y) ((x) * (y) <= 0.0F && (x) - (y) != 0.0F)
-#endif
-
 #define MAX_CLIPPED_VERTICES ((2 * (6 + PIPE_MAX_CLIP_PLANES))+1)
 
 
@@ -479,6 +474,7 @@ do_clip_tri(struct draw_stage *stage,
   for (i = 1; i <= n; i++) {
  struct vertex_header *vert = inlist[i];
  boolean *edge = &inEdges[i];
+ boolean different_sign;
 
  float dp = getclipdist(clipper, vert, plane_idx);
 
@@ -491,9 +487,12 @@ do_clip_tri(struct draw_stage *stage,
return;
 outEdges[outcount] = *edge_prev;
 outlist[outcount++] = vert_prev;
+different_sign = dp < 0.0f;
+ } else {
+different_sign = !(dp < 0.0f);
  }
 
- if (DIFFERENT_SIGNS(dp, dp_prev)) {
+ if (different_sign) {
 struct vertex_header *new_vert;
 boolean *new_edge;
 
@@ -511,7 +510,7 @@ do_clip_tri(struct draw_stage *stage,
 
 if (dp < 0.0f) {
/* Going out of bounds.  Avoid division by zero as we
-* know dp != dp_prev from DIFFERENT_SIGNS, above.
+* know dp != dp_prev from different_sign, above.
 */
float t = dp / (dp - dp_prev);
interp( clipper, new_vert, t, vert, vert_prev, viewport_index );
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] draw: simplify clip null tri logic

2018-04-24 Thread sroland
From: Roland Scheidegger 

Simplifies the logic when to emit null tris (albeit the reasons why we
have to do this remain unclear).
This is strictly just logic simplification, the behavior doesn't change
at all.
---
 src/gallium/auxiliary/draw/draw_pipe_clip.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c 
b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 4cfa54b..b7a1b5c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -253,7 +253,7 @@ static void emit_poly(struct draw_stage *stage,
unsigned i;
ushort edge_first, edge_middle, edge_last;
boolean last_tri_was_null = FALSE;
-   boolean tri_was_not_null = FALSE;
+   boolean tri_emitted = FALSE;
 
if (stage->draw->rasterizer->flatshade_first) {
   edge_first  = DRAW_PIPE_EDGE_FLAG_0;
@@ -289,17 +289,16 @@ static void emit_poly(struct draw_stage *stage,
   }
 
   tri_null = is_tri_null(clipper, &header);
-  /* If we generated a triangle with an area, aka. non-null triangle,
-   * or if the previous triangle was also null then skip all subsequent
-   * null triangles */
-  if ((tri_was_not_null && tri_null) || (last_tri_was_null && tri_null)) {
- last_tri_was_null = tri_null;
+  /*
+   * If we ever generated a tri (regardless if it had area or not),
+   * skip all subsequent null tris.
+   * FIXME: it is unclear why we always have to emit at least one
+   * tri. Maybe this is hiding bugs elsewhere.
+   */
+  if (tri_null && tri_emitted) {
  continue;
   }
-  last_tri_was_null = tri_null;
-  if (!tri_null) {
- tri_was_not_null = TRUE;
-  }
+  tri_emitted = TRUE;
 
   if (!edgeflags[i-1]) {
  header.flags &= ~edge_middle;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] gallivm: dump bitcode before optimization

2018-04-22 Thread sroland
From: Roland Scheidegger 

If we dump the bitcode for off-line debug purposes, we really want the
pre-optimized bitcode, otherwise it's useless in identifying problems
with IR optimization (if you have a shader which takes an hour to do
IR optimization, it's also nice you don't have to wait that hour...).
Also, print out the function passes for opt which correspond to what
was used for jit compilation (and also the opt level for codegen).
Using opt/llc this way should then pretty much mimic what was done
for jit. (When specifying something like -time-passes
-debug-pass=[Structure|Arguments] (for either opt or llc) that also
gives very useful information in which passes all the time was spent,
and which passes are really run along with the order - llvm will add
passes due to dependencies on its own, and of course -O2 for llc
comes with a ~100 pass list.)
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 35 +
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index d0afff1..41d828c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -142,6 +142,10 @@ create_pass_manager(struct gallivm_state *gallivm)
* TODO: Evaluate passes some more - keeping in mind
* both quality of generated code and compile times.
*/
+  /*
+   * NOTE: if you change this, don't forget to change the output
+   * with GALLIVM_DEBUG_DUMP_BC in gallivm_compile_module.
+   */
   LLVMAddScalarReplAggregatesPass(gallivm->passmgr);
   LLVMAddEarlyCSEPass(gallivm->passmgr);
   LLVMAddCFGSimplificationPass(gallivm->passmgr);
@@ -151,7 +155,7 @@ create_pass_manager(struct gallivm_state *gallivm)
* due to licm implying lcssa (since llvm 3.5), which can take forever.
* Even for sane shaders, the cost of licm is rather high (and not just
* due to lcssa, licm itself too), though mostly only in cases when it
-   * can actually move things, so having to disable it is a pity.   
+   * can actually move things, so having to disable it is a pity.
* LLVMAddLICMPass(gallivm->passmgr);
*/
   LLVMAddReassociatePass(gallivm->passmgr);
@@ -597,6 +601,22 @@ gallivm_compile_module(struct gallivm_state *gallivm)
   gallivm->builder = NULL;
}
 
+   /* Dump bitcode to a file */
+   if (gallivm_debug & GALLIVM_DEBUG_DUMP_BC) {
+  char filename[256];
+  assert(gallivm->module_name);
+  util_snprintf(filename, sizeof(filename), "ir_%s.bc", 
gallivm->module_name);
+  LLVMWriteBitcodeToFile(gallivm->module, filename);
+  debug_printf("%s written\n", filename);
+  debug_printf("Invoke as \"opt %s %s | llc -O%d %s%s\"\n",
+   gallivm_debug & GALLIVM_DEBUG_NO_OPT ? "-mem2reg" :
+   "-sroa -early-cse -simplifycfg -reassociate "
+   "-mem2reg -constprop -instcombine -gvn",
+   filename, gallivm_debug & GALLIVM_DEBUG_NO_OPT ? 0 : 2,
+   (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "",
+   "[-mattr=<-mattr option(s)>]");
+   }
+
if (gallivm_debug & GALLIVM_DEBUG_PERF)
   time_begin = os_time_get();
 
@@ -630,19 +650,6 @@ gallivm_compile_module(struct gallivm_state *gallivm)
gallivm->module_name, time_msec);
}
 
-   /* Dump byte code to a file */
-   if (gallivm_debug & GALLIVM_DEBUG_DUMP_BC) {
-  char filename[256];
-  assert(gallivm->module_name);
-  util_snprintf(filename, sizeof(filename), "ir_%s.bc", 
gallivm->module_name);
-  LLVMWriteBitcodeToFile(gallivm->module, filename);
-  debug_printf("%s written\n", filename);
-  debug_printf("Invoke as \"llc %s%s -o - %s\"\n",
-   (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "",
-   "[-mattr=<-mattr option(s)>]",
-   filename);
-   }
-
if (use_mcjit) {
   /* Setting the module's DataLayout to an empty string will cause the
* ExecutionEngine to copy to the DataLayout string from its target
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/4] gallivm: (trivial) do division by 1000 with int64

2018-04-22 Thread sroland
From: Roland Scheidegger 

Conversion to int can otherwise overflow if compile times are over
~71min. (Yes this can happen...)
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index abca624..d0afff1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -624,7 +624,7 @@ gallivm_compile_module(struct gallivm_state *gallivm)
 
if (gallivm_debug & GALLIVM_DEBUG_PERF) {
   int64_t time_end = os_time_get();
-  int time_msec = (int)(time_end - time_begin) / 1000;
+  int time_msec = (int)((time_end - time_begin) / 1000);
   assert(gallivm->module_name);
   debug_printf("optimizing module %s took %d msec\n",
gallivm->module_name, time_msec);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] gallivm: remove LICM pass

2018-04-22 Thread sroland
From: Roland Scheidegger 

LICM is simply too expensive, even though it presumably can help quite
a bit in some cases.
It was definitely cheaper in llvm 3.3, though as far as I can tell with
llvm 3.3 it failed to do anything in most cases. early-cse also actually
seems to cause licm to be able to move things when it previously couldn't,
which causes noticeable compile time increases.
There's more loop passes in llvm, but I'm not sure which ones are helpful,
and I couldn't find anything which would roughly do what the old licm in
llvm 3.3 did, so ditch it.
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 05a74a0..abca624 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -145,7 +145,15 @@ create_pass_manager(struct gallivm_state *gallivm)
   LLVMAddScalarReplAggregatesPass(gallivm->passmgr);
   LLVMAddEarlyCSEPass(gallivm->passmgr);
   LLVMAddCFGSimplificationPass(gallivm->passmgr);
-  LLVMAddLICMPass(gallivm->passmgr);
+  /* 
+   * FIXME: LICM is potentially quite useful. However, for some
+   * rather crazy shaders the compile time can reach _hours_ per shader,
+   * due to licm implying lcssa (since llvm 3.5), which can take forever.
+   * Even for sane shaders, the cost of licm is rather high (and not just
+   * due to lcssa, licm itself too), though mostly only in cases when it
+   * can actually move things, so having to disable it is a pity.   
+   * LLVMAddLICMPass(gallivm->passmgr);
+   */
   LLVMAddReassociatePass(gallivm->passmgr);
   LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr);
   LLVMAddConstantPropagationPass(gallivm->passmgr);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] gallivm: add early cse pass

2018-04-22 Thread sroland
From: Roland Scheidegger 

This pass is quite cheap, and can simplify the IR quite a bit for our
generated IR.
In particular on a variety of shaders I've found the time saved by
other passes due to the simplified IR more than makes up for the cost
of this pass, and on top of that the end result is actually better.
The only downside I've found is this enables the LICM pass to move some
things out of the main shader loop (in the case I've seen, instanced
vertex fetch (which is constant within the jit shader) plus the derived
instructions in the shader) which it couldn't do before for some reason.
This would actually be desirable but can increase compile time
considerably (licm seems to have considerable cost when it actually can
move things out of loops, due to alias analysis). But blaming early cse
for this seems inappropriate. (Note that the first two sroa / earlycse
passes are similar to what a standard llvm opt -O1/-O2 pipeline would
do, albeit this has some more passes even before but I don't think
they'd do much for us.)
It also in particular helps some crazy shader used for driver
verification (don't ask...) a lot (about factor of 6 faster in compile
time) (due to simplifying the IR before LICM is run).
While here, also move licm behind simplifycfg. For some shaders there
seems to be very significant compile time gains (we've seen a factor
of 1 albeit that was a really crazy shader you'd certainly never
see in a real app), because LICM is quite expensive and there are cases
where running simplifycfg (along with sroa and early-cse) before licm
reduces IR complexity significantly. (I'm not entirely sure if it would
make sense to also run it afterwards.)
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index ea5489b..05a74a0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -138,13 +138,14 @@ create_pass_manager(struct gallivm_state *gallivm)
}
 
if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
-  /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
-   * but there are more on SVN.
-   * TODO: Add more passes.
+  /*
+   * TODO: Evaluate passes some more - keeping in mind
+   * both quality of generated code and compile times.
*/
   LLVMAddScalarReplAggregatesPass(gallivm->passmgr);
-  LLVMAddLICMPass(gallivm->passmgr);
+  LLVMAddEarlyCSEPass(gallivm->passmgr);
   LLVMAddCFGSimplificationPass(gallivm->passmgr);
+  LLVMAddLICMPass(gallivm->passmgr);
   LLVMAddReassociatePass(gallivm->passmgr);
   LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr);
   LLVMAddConstantPropagationPass(gallivm->passmgr);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600: fix abs for op3 sources

2018-03-12 Thread sroland
From: Roland Scheidegger 

If a src was referencing the same temp as the dst, the per-component
copy code didn't work.
e.g.
  cndge r0.xy, r0.xx, |r2|, r3
got expanded into
  mov  r12.x, |r2|
  cndge r0.x, r0.x, r12, r3
  mov  r12.y, |r2|
  cndge r0.y, r0.x, r12, r3
hence for the second cndge r0.x was mistakenly the previous cndge result.
Fix this by doing all the movs first, so there's no bogus alu.last in between.

Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=102905
---
 src/gallium/drivers/r600/r600_shader.c | 110 +
 1 file changed, 56 insertions(+), 54 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 6b5c42f86d..bd511c76ac 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -7076,33 +7076,42 @@ static int tgsi_helper_copy(struct r600_shader_ctx 
*ctx, struct tgsi_full_instru
 }
 
 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
- unsigned temp, int chan,
+ unsigned writemask,
  struct r600_bytecode_alu_src *bc_src,
  const struct r600_shader_src *shader_src)
 {
struct r600_bytecode_alu alu;
-   int r;
+   int i, r;
+   int lasti = tgsi_last_instruction(writemask);
+   int temp_reg = 0;
 
-   r600_bytecode_src(bc_src, shader_src, chan);
+   r600_bytecode_src(&bc_src[0], shader_src, 0);
+   r600_bytecode_src(&bc_src[1], shader_src, 1);
+   r600_bytecode_src(&bc_src[2], shader_src, 2);
+   r600_bytecode_src(&bc_src[3], shader_src, 3);
 
-   /* op3 operands don't support abs modifier */
if (bc_src->abs) {
-   assert(temp!=0);  /* we actually need the extra register, 
make sure it is allocated. */
-   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-   alu.op = ALU_OP1_MOV;
-   alu.dst.sel = temp;
-   alu.dst.chan = chan;
-   alu.dst.write = 1;
+   temp_reg = r600_get_temp(ctx);
 
-   alu.src[0] = *bc_src;
-   alu.last = true; // sufficient?
-   r = r600_bytecode_add_alu(ctx->bc, &alu);
-   if (r)
-   return r;
-
-   memset(bc_src, 0, sizeof(*bc_src));
-   bc_src->sel = temp;
-   bc_src->chan = chan;
+   for (i = 0; i < lasti + 1; i++) {
+   if (!(writemask & (1 << i)))
+   continue;
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP1_MOV;
+   alu.dst.sel = temp_reg;
+   alu.dst.chan = i;
+   alu.dst.write = 1;
+   alu.src[0] = bc_src[i];
+   if (i == lasti) {
+   alu.last = 1;
+   }
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+   memset(&bc_src[i], 0, sizeof(*bc_src));
+   bc_src[i].sel = temp_reg;
+   bc_src[i].chan = i;
+   }
}
return 0;
 }
@@ -7111,9 +7120,9 @@ static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int 
dst)
 {
struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
struct r600_bytecode_alu alu;
+   struct r600_bytecode_alu_src srcs[4][4];
int i, j, r;
int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
-   int temp_regs[4];
unsigned op = ctx->inst_info->op;
 
if (op == ALU_OP3_MULADD_IEEE &&
@@ -7121,10 +7130,12 @@ static int tgsi_op3_dst(struct r600_shader_ctx *ctx, 
int dst)
op = ALU_OP3_MULADD;
 
for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-   temp_regs[j] = 0;
-   if (ctx->src[j].abs)
-   temp_regs[j] = r600_get_temp(ctx);
+   r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
+ srcs[j], &ctx->src[j]);
+   if (r)
+   return r;
}
+
for (i = 0; i < lasti + 1; i++) {
if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
continue;
@@ -7132,9 +7143,7 @@ static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int 
dst)
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
alu.op = op;
for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
-   r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, 
&alu.src[j], &ctx->src[j]);
-   if (r)
-   return r;
+   alu.src[j] = srcs[j][i]

[Mesa-dev] [PATCH] u_blit: (trivial) u_blit.h needs to include p_defines.h

2018-03-09 Thread sroland
From: Roland Scheidegger 

(For the pipe_tex_filter enum)
---
 src/gallium/auxiliary/util/u_blit.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/util/u_blit.h 
b/src/gallium/auxiliary/util/u_blit.h
index 085ea63..004ceae 100644
--- a/src/gallium/auxiliary/util/u_blit.h
+++ b/src/gallium/auxiliary/util/u_blit.h
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef __cplusplus
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: fix alpha value for very short aa lines

2018-03-08 Thread sroland
From: Roland Scheidegger 

The logic would not work correctly for line lengths smaller than 1.0;
even a degenerate line with length 0 would still produce a fragment
with an alpha value anywhere between 0.0 and 0.5.
---
 src/gallium/auxiliary/draw/draw_pipe_aaline.c  | 25 -
 src/gallium/auxiliary/draw/draw_pipe_stipple.c |  1 -
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c 
b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 14a4b2f..66a943a 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -370,7 +370,30 @@ aaline_line(struct draw_stage *stage, struct prim_header 
*header)
float t_l, t_w;
uint i;
 
-   half_length = 0.5f * sqrtf(dx * dx + dy * dy) + 0.5f;
+   half_length = 0.5f * sqrtf(dx * dx + dy * dy);
+
+   if (half_length < 0.5f) {
+  /*
+   * The logic we use for "normal" sized segments is incorrect
+   * for very short segments (basically because we only have
+   * one value to interpolate, not a distance to each endpoint).
+   * Therefore, we calculate half_length differently, so that for
+   * original line length (near) 0, we get alpha 0 - otherwise
+   * max alpha would still be 0.5. This also prevents us from
+   * artifacts due to degenerated lines (the endpoints being
+   * identical, which would still receive anywhere from alpha
+   * 0-0.5 otherwise) (at least the pstipple stage may generate
+   * such lines due to float inaccuracies if line length is very
+   * close to a integer).
+   * Might not be fully accurate neither (because the "strength" of
+   * the line is going to be determined by how close to the pixel
+   * center those 1 or 2 fragments are) but it's probably the best
+   * we can do.
+   */
+  half_length = 2.0f * half_length;
+   } else {
+  half_length = half_length + 0.5f;
+   }
 
t_w = half_width;
t_l = 0.5f;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c 
b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 3a44e96..d30572c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -150,7 +150,6 @@ stipple_line(struct draw_stage *stage, struct prim_header 
*header)
if (header->flags & DRAW_PIPE_RESET_STIPPLE)
   stipple->counter = 0;
 
-
/* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table.
 */
for (i = 0; i < length; i++) {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] draw: fix line stippling with aa lines

2018-03-06 Thread sroland
From: Roland Scheidegger 

In contrast to non-aa, where stippling is based on either dx or dy
(depending on if it's a x or y major line), stippling is based on
actual distance with smooth lines, so adjust for this.

(It looks like there's some minor artifacts with mesa demos
line-sample with wide lines, I think there might be some issues
with wide lines and very short line segments (when the original
line segment length is below half a pixel) but it may be related
to aa lines rather than stippling.)
---
 src/gallium/auxiliary/draw/draw_pipe_stipple.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c 
b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 3a84d6c..8fa8274 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -50,6 +50,7 @@ struct stipple_stage {
float counter;
uint pattern;
uint factor;
+   bool smooth;
 };
 
 
@@ -136,9 +137,15 @@ stipple_line(struct draw_stage *stage, struct prim_header 
*header)
float dx = x0 > x1 ? x0 - x1 : x1 - x0;
float dy = y0 > y1 ? y0 - y1 : y1 - y0;
 
-   float length = MAX2(dx, dy);
+   float length;
int i;
 
+   if (stipple->smooth) {
+  length = sqrtf(dx*dx + dy*dy);
+   } else {
+  length = MAX2(dx, dy);
+   }
+
if (header->flags & DRAW_PIPE_RESET_STIPPLE)
   stipple->counter = 0;
 
@@ -205,6 +212,7 @@ stipple_first_line(struct draw_stage *stage,
 
stipple->pattern = draw->rasterizer->line_stipple_pattern;
stipple->factor = draw->rasterizer->line_stipple_factor + 1;
+   stipple->smooth = draw->rasterizer->line_smooth;
 
stage->line = stipple_line;
stage->line(stage, header);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] draw: simplify (and correct) aaline fallback (v2)

2018-03-06 Thread sroland
From: Roland Scheidegger 

The motivation actually was to get rid of the additional tex
instruction, since that requires the draw fallback code to intercept
all sampler / view calls (even if the fallback is never hit).
Basically, the idea is to use coverage of the pixel to calculate
the alpha value, and coverage is simply based on the distance
to the center of the line (in both line direction, which is useful
for wide lines, as well as perpendicular to the line).
This is much closer to what hw supporting this natively actually does.
It also fixes an issue with line width not quite being correct, as
well as endpoints getting stretched too far (in line direction) with
wide lines, which is apparent with mesa demo line-width.
(For llvmpipe, it would probably make sense to do something like this
directly when drawing lines, since rendering two tris is twice as
expensive as a line, but it would need some changes with state
management.)
Since we're no longer relying on mipmapping to get the alpha value,
we also don't need to draw 3 rects (6 tris), one is sufficient.

There are still issues (as before):
- quite sure it's not correct without half_pixel_center, but can't test
this with GL.
- aaline + line stipple is incorrect (evident with line-width demo).
Looking at the spec, the stipple pattern should actually be based on
distance (not just dx or dy for x/y major lines as without aa).
- outputs (other than pos + the one used for line aa) should be
reinterpolated since we actually increase line length by half a pixel
(but there are no tests which would care).

v2: simplify the math (should be equivalent), don't need immediate
---
 src/gallium/auxiliary/draw/draw_pipe_aaline.c | 504 +-
 1 file changed, 100 insertions(+), 404 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c 
b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index a859dbc..591e2a3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -1,6 +1,6 @@
 /**
  *
- * Copyright 2007 VMware, Inc.
+ * Copyright 2007-2018 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,7 +26,7 @@
  **/
 
 /**
- * AA line stage:  AA lines are converted to texture mapped triangles.
+ * AA line stage:  AA lines are converted triangles (with extra generic)
  *
  * Authors:  Brian Paul
  */
@@ -40,7 +40,6 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_sampler.h"
 
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
@@ -55,19 +54,6 @@
 
 
 /**
- * Size for the alpha texture used for antialiasing
- */
-#define TEXTURE_SIZE_LOG2  5   /* 32 x 32 */
-
-/**
- * Max texture level for the alpha texture used for antialiasing
- *
- * Don't use the 1x1 and 2x2 mipmap levels.
- */
-#define MAX_TEXTURE_LEVEL  (TEXTURE_SIZE_LOG2 - 2)
-
-
-/**
  * Subclass of pipe_shader_state to carry extra fragment shader info.
  */
 struct aaline_fragment_shader
@@ -75,8 +61,7 @@ struct aaline_fragment_shader
struct pipe_shader_state state;
void *driver_fs;
void *aaline_fs;
-   uint sampler_unit;
-   int generic_attrib;  /**< texcoord/generic used for texture */
+   int generic_attrib;  /**< generic used for distance */
 };
 
 
@@ -89,26 +74,16 @@ struct aaline_stage
 
float half_line_width;
 
-   /** For AA lines, this is the vertex attrib slot for the new texcoords */
-   uint tex_slot;
+   /** For AA lines, this is the vertex attrib slot for new generic */
+   uint coord_slot;
/** position, not necessarily output zero */
uint pos_slot;
 
-   void *sampler_cso;
-   struct pipe_resource *texture;
-   struct pipe_sampler_view *sampler_view;
-   uint num_samplers;
-   uint num_sampler_views;
-
 
/*
 * Currently bound state
 */
struct aaline_fragment_shader *fs;
-   struct {
-  void *sampler[PIPE_MAX_SAMPLERS];
-  struct pipe_sampler_view *sampler_views[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   } state;
 
/*
 * Driver interface/override functions
@@ -117,15 +92,6 @@ struct aaline_stage
 const struct pipe_shader_state *);
void (*driver_bind_fs_state)(struct pipe_context *, void *);
void (*driver_delete_fs_state)(struct pipe_context *, void *);
-
-   void (*driver_bind_sampler_states)(struct pipe_context *,
-  enum pipe_shader_type, unsigned,
-  unsigned, void **);
-
-   void (*driver_set_sampler_views)(struct pipe_context *,
-enum pipe_shader_type shader,
-unsigned start, unsigned count,
-struct pipe_sampler_view **);
 };
 
 
@@ -136,41 +102,27 @@ struct aaline_sta

[Mesa-dev] [PATCH] draw: simplify (and correct) aaline fallback

2018-03-06 Thread sroland
From: Roland Scheidegger 

The motivation actually was to get rid of the additional tex
instruction, since that requires the draw fallback code to intercept
all sampler / view calls (even if the fallback is never hit).
Basically, the idea is to use coverage of the pixel to calculate
the alpha value, and coverage is simply based on the distance
to the center of the line (in both line direction, which is useful
for wide lines, as well as perpendicular to the line).
This is much closer to what hw supporting this natively actually does.
It also fixes an issue with line width not quite being correct, as
well as endpoints getting stretched too far (in line direction) with
wide lines, which is apparent with mesa demo line-width.
(For llvmpipe, it would probably make sense to do something like this
directly when drawing lines, since rendering two tris is twice as
expensive as a line, but it would need some changes with state
management.)
Since we're no longer relying on mipmapping to get the alpha value,
we also don't need to draw 3 rects (6 tris), one is sufficient.

There are still issues (as before):
- quite sure it's not correct without half_pixel_center, but can't test
this with GL.
- aaline + line stipple is incorrect (evident with line-width demo).
Looking at the spec, the stipple pattern should actually be based on
distance (not just dx or dy for x/y major lines as without aa).
- outputs (other than pos + the one used for line aa) should be
reinterpolated since we actually increase line length by half a pixel
(but there are no tests which would care).
---
 src/gallium/auxiliary/draw/draw_pipe_aaline.c | 532 +++---
 1 file changed, 131 insertions(+), 401 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c 
b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index a859dbc..b490a50 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -1,6 +1,6 @@
 /**
  *
- * Copyright 2007 VMware, Inc.
+ * Copyright 2007-2018 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,7 +26,7 @@
  **/
 
 /**
- * AA line stage:  AA lines are converted to texture mapped triangles.
+ * AA line stage:  AA lines are converted triangles (with extra generic)
  *
  * Authors:  Brian Paul
  */
@@ -40,7 +40,6 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "util/u_sampler.h"
 
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
@@ -55,19 +54,6 @@
 
 
 /**
- * Size for the alpha texture used for antialiasing
- */
-#define TEXTURE_SIZE_LOG2  5   /* 32 x 32 */
-
-/**
- * Max texture level for the alpha texture used for antialiasing
- *
- * Don't use the 1x1 and 2x2 mipmap levels.
- */
-#define MAX_TEXTURE_LEVEL  (TEXTURE_SIZE_LOG2 - 2)
-
-
-/**
  * Subclass of pipe_shader_state to carry extra fragment shader info.
  */
 struct aaline_fragment_shader
@@ -75,8 +61,7 @@ struct aaline_fragment_shader
struct pipe_shader_state state;
void *driver_fs;
void *aaline_fs;
-   uint sampler_unit;
-   int generic_attrib;  /**< texcoord/generic used for texture */
+   int generic_attrib;  /**< generic used for distance */
 };
 
 
@@ -89,26 +74,16 @@ struct aaline_stage
 
float half_line_width;
 
-   /** For AA lines, this is the vertex attrib slot for the new texcoords */
-   uint tex_slot;
+   /** For AA lines, this is the vertex attrib slot for new generic */
+   uint coord_slot;
/** position, not necessarily output zero */
uint pos_slot;
 
-   void *sampler_cso;
-   struct pipe_resource *texture;
-   struct pipe_sampler_view *sampler_view;
-   uint num_samplers;
-   uint num_sampler_views;
-
 
/*
 * Currently bound state
 */
struct aaline_fragment_shader *fs;
-   struct {
-  void *sampler[PIPE_MAX_SAMPLERS];
-  struct pipe_sampler_view *sampler_views[PIPE_MAX_SHADER_SAMPLER_VIEWS];
-   } state;
 
/*
 * Driver interface/override functions
@@ -117,15 +92,6 @@ struct aaline_stage
 const struct pipe_shader_state *);
void (*driver_bind_fs_state)(struct pipe_context *, void *);
void (*driver_delete_fs_state)(struct pipe_context *, void *);
-
-   void (*driver_bind_sampler_states)(struct pipe_context *,
-  enum pipe_shader_type, unsigned,
-  unsigned, void **);
-
-   void (*driver_set_sampler_views)(struct pipe_context *,
-enum pipe_shader_type shader,
-unsigned start, unsigned count,
-struct pipe_sampler_view **);
 };
 
 
@@ -136,41 +102,38 @@ struct aaline_stage
  */
 struct aa_transform_context {
struct tgsi_transform_con

[Mesa-dev] [PATCH] tgsi/scan: use wrap-around shift behavior explicitly for file_mask

2018-03-01 Thread sroland
From: Roland Scheidegger 

The comment said it will only represent the lowest 32 regs. This was
not entirely true in practice, since at least on x86 you'll get
masked shifts (unless the compiler could recognize it already and toss
it out). It turns out this actually works out alright (presumably
no one uses it for temp regs) when increasing max sampler views, so
make that behavior explicit.
It feels a bit hacky, though (but in any case, explicit behavior there
is better than undefined behavior).
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 7 +--
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 7 ++-
 src/gallium/drivers/swr/swr_shader.cpp | 2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c 
b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index c35eff2..0d229c9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -585,8 +585,11 @@ scan_declaration(struct tgsi_shader_info *info,
   int buffer;
   unsigned index, target, type;
 
-  /* only first 32 regs will appear in this bitfield */
-  info->file_mask[file] |= (1 << reg);
+  /*
+   * only the first 32 regs will appear in this bitfield; for larger
+   * register indices the bits will wrap around.
+   */
+  info->file_mask[file] |= (1 << (reg & 31));
   info->file_count[file]++;
   info->file_max[file] = MAX2(info->file_max[file], (int)reg);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 603fd84..48c004c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -3323,7 +3323,12 @@ make_variant_key(struct llvmpipe_context *lp,
if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
   key->nr_sampler_views = 
shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
   for(i = 0; i < key->nr_sampler_views; ++i) {
- if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
+ /*
+  * Note sview may exceed what's representable by file_mask.
+  * This will still work, the only downside is that not actually
+  * used views may be included in the shader key.
+  */
+ if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << (i & 
31))) {
 lp_sampler_static_texture_state(&key->state[i].texture_state,
 
lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
  }
diff --git a/src/gallium/drivers/swr/swr_shader.cpp 
b/src/gallium/drivers/swr/swr_shader.cpp
index e5fb679..fa1c0b8 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -98,7 +98,7 @@ swr_generate_sampler_key(const struct lp_tgsi_info &info,
   key.nr_sampler_views =
  info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
   for (unsigned i = 0; i < key.nr_sampler_views; i++) {
- if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
+ if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << (i & 31))) {
 const struct pipe_sampler_view *view =
ctx->sampler_views[shader_type][i];
 lp_sampler_static_texture_state(
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] cso: don't cycle through PIPE_MAX_SHADER_SAMPLER_VIEWS on context destroy

2018-02-27 Thread sroland
From: Roland Scheidegger 

There's no point, we know the highest non-null one.
---
 src/gallium/auxiliary/cso_cache/cso_context.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c 
b/src/gallium/auxiliary/cso_cache/cso_context.c
index 1b5d4b5..3fa57f1 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -407,8 +407,10 @@ void cso_destroy_context( struct cso_context *ctx )
  ctx->pipe->set_stream_output_targets(ctx->pipe, 0, NULL, NULL);
}
 
-   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
+   for (i = 0; i < ctx->nr_fragment_views; i++) {
   pipe_sampler_view_reference(&ctx->fragment_views[i], NULL);
+   }
+   for (i = 0; i < ctx->nr_fragment_views_saved; i++) {
   pipe_sampler_view_reference(&ctx->fragment_views_saved[i], NULL);
}
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] softpipe: don't iterate through PIPE_MAX_SHADER_SAMPLER_VIEWS

2018-02-27 Thread sroland
From: Roland Scheidegger 

We were setting view to NULL if the iteration index was not below num.
But in fact if the view is NULL the code did nothing anyway...
---
 src/gallium/drivers/softpipe/sp_state_sampler.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c 
b/src/gallium/drivers/softpipe/sp_state_sampler.c
index c10fd91..751eb76 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -181,8 +181,8 @@ prepare_shader_sampling(
if (!num)
   return;
 
-   for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) {
-  struct pipe_sampler_view *view = i < num ? views[i] : NULL;
+   for (i = 0; i < num; i++) {
+  struct pipe_sampler_view *view = views[i];
 
   if (view) {
  struct pipe_resource *tex = view->texture;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] RFC: gallium: increase PIPE_MAX_SHADER_SAMPLER_VIEWS to 128

2018-02-26 Thread sroland
From: Roland Scheidegger 

Some state trackers require 128.
(There are no plans to increase PIPE_MAX_SAMPLERS too, since with the gl
state tracker it's unlikely more than 32 will be needed; if you need
more, use bindless.)
---
 src/gallium/include/pipe/p_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_state.h 
b/src/gallium/include/pipe/p_state.h
index 2b56d60..cddb3b4 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -64,7 +64,7 @@ extern "C" {
 #define PIPE_MAX_SAMPLERS 32
 #define PIPE_MAX_SHADER_INPUTS80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_OUTPUTS   80 /* 32 GENERIC + 32 PATCH + 16 others */
-#define PIPE_MAX_SHADER_SAMPLER_VIEWS 32
+#define PIPE_MAX_SHADER_SAMPLER_VIEWS 128
 #define PIPE_MAX_SHADER_BUFFERS   32
 #define PIPE_MAX_SHADER_IMAGES32
 #define PIPE_MAX_TEXTURE_LEVELS   16
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: don't needlessly iterate through all sampler view slots

2018-02-26 Thread sroland
From: Roland Scheidegger 

We already stored the highest (potentially) used number.
---
 src/gallium/auxiliary/draw/draw_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index 9791ec5..e887272 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -973,7 +973,7 @@ draw_set_sampler_views(struct draw_context *draw,
 
for (i = 0; i < num; ++i)
   draw->sampler_views[shader_stage][i] = views[i];
-   for (i = num; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; ++i)
+   for (i = num; i < draw->num_sampler_views[shader_stage]; ++i)
   draw->sampler_views[shader_stage][i] = NULL;
 
draw->num_sampler_views[shader_stage] = num;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] tgsi: Recognize RET in main for tgsi_transform

2018-02-13 Thread sroland
From: Roland Scheidegger 

Shaders coming from dx10 state trackers have a RET before the END.
And the epilog needs to be placed before the RET (otherwise it will
get ignored).
Hence figure out if a RET is in main, in this case we'll place
the epilog there rather than before the END.
(At a closer look, there actually seem to be problems with control
flow in general with output redirection, that would need another
look. It's enough however to fix draw's aa line emulation in some
internal bug - lines tend to be drawn with trivial shaders, moving
either a constant color or a vertex color directly to the output).

v2: add assert so buggy handling of RET in main is detected
---
 src/gallium/auxiliary/tgsi/tgsi_transform.c | 62 +
 1 file changed, 55 insertions(+), 7 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c 
b/src/gallium/auxiliary/tgsi/tgsi_transform.c
index ffdad13..a13cf90 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -110,6 +110,9 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
 {
uint procType;
boolean first_instruction = TRUE;
+   boolean epilog_emitted = FALSE;
+   int cond_stack = 0;
+   int call_stack = 0;
 
/* input shader */
struct tgsi_parse_context parse;
@@ -166,22 +169,66 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
  {
 struct tgsi_full_instruction *fullinst
= &parse.FullToken.FullInstruction;
+unsigned opcode = fullinst->Instruction.Opcode;
 
 if (first_instruction && ctx->prolog) {
ctx->prolog(ctx);
 }
 
-/* XXX Note: we may also want to look for a main/top-level
- * TGSI_OPCODE_RET instruction in the future.
+/*
+ * XXX Note: we handle the case of ret in main.
+ * However, the output redirections done by transform
+ * have their limits with control flow and will generally
+ * not work correctly. e.g.
+ * if (cond) {
+ *oColor = x;
+ *ret;
+ * }
+ * oColor = y;
+ * end;
+ * If the color output is redirected to a temp and modified
+ * by a transform, this will not work (the oColor assignment
+ * in the conditional will never make it to the actual output).
  */
-if (fullinst->Instruction.Opcode == TGSI_OPCODE_END
-&& ctx->epilog) {
-   /* Emit caller's epilog */
-   ctx->epilog(ctx);
-   /* Emit END */
+if ((opcode == TGSI_OPCODE_END || opcode == TGSI_OPCODE_RET) &&
+ call_stack == 0 && ctx->epilog && !epilog_emitted) {
+   if (opcode == TGSI_OPCODE_RET && cond_stack != 0) {
+  assert(!"transform ignoring RET in main");
+   } else {
+  assert(cond_stack == 0);
+  /* Emit caller's epilog */
+  ctx->epilog(ctx);
+  epilog_emitted = TRUE;
+   }
+   /* Emit END (or RET) */
ctx->emit_instruction(ctx, fullinst);
 }
 else {
+   switch (opcode) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_UIF:
+   case TGSI_OPCODE_SWITCH:
+   case TGSI_OPCODE_BGNLOOP:
+  cond_stack++;
+  break;
+   case TGSI_OPCODE_CAL:
+  call_stack++;
+  break;
+   case TGSI_OPCODE_ENDIF:
+   case TGSI_OPCODE_ENDSWITCH:
+   case TGSI_OPCODE_ENDLOOP:
+  assert(cond_stack > 0);
+  cond_stack--;
+  break;
+   case TGSI_OPCODE_ENDSUB:
+  assert(call_stack > 0);
+  call_stack--;
+  break;
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_RET:
+   default:
+  break;
+   }
if (ctx->transform_instruction)
   ctx->transform_instruction(ctx, fullinst);
else
@@ -231,6 +278,7 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
  assert( 0 );
   }
}
+   assert(call_stack == 0);
 
tgsi_parse_free (&parse);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] tgsi: Recognize RET in main for tgsi_transform

2018-02-12 Thread sroland
From: Roland Scheidegger 

Shaders coming from dx10 state trackers have a RET before the END.
And the epilog needs to be placed before the RET (otherwise it will
get ignored).
Hence figure out if a RET is in main, in this case we'll place
the epilog there rather than before the END.
(At a closer look, there actually seem to be problems with control
flow in general with output redirection, that would need another
look. It's enough however to fix draw's aa line emulation in some
internal bug - lines tend to be drawn with trivial shaders, moving
either a constant color or a vertex color directly to the output).
---
 src/gallium/auxiliary/tgsi/tgsi_transform.c | 50 ++---
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c 
b/src/gallium/auxiliary/tgsi/tgsi_transform.c
index ffdad13..94d872c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -110,6 +110,8 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
 {
uint procType;
boolean first_instruction = TRUE;
+   boolean epilog_emitted = FALSE;
+   int stack_size = 0;
 
/* input shader */
struct tgsi_parse_context parse;
@@ -166,22 +168,60 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
  {
 struct tgsi_full_instruction *fullinst
= &parse.FullToken.FullInstruction;
+unsigned opcode = fullinst->Instruction.Opcode;
 
 if (first_instruction && ctx->prolog) {
ctx->prolog(ctx);
 }
 
-/* XXX Note: we may also want to look for a main/top-level
- * TGSI_OPCODE_RET instruction in the future.
+/*
+ * XXX Note: we handle the case of ret in main.
+ * However, the output redirections done by transform
+ * have their limits with control flow and will generally
+ * not work correctly. e.g.
+ * if (cond) {
+ *oColor = x;
+ *ret;
+ * }
+ * oColor = y;
+ * end;
+ * If the color output is redirected to a temp and modified
+ * by a transform, this will not work (the oColor assignment
+ * in the conditional will never make it to the actual output).
  */
-if (fullinst->Instruction.Opcode == TGSI_OPCODE_END
-&& ctx->epilog) {
+if ((opcode == TGSI_OPCODE_END ||
+ (opcode == TGSI_OPCODE_RET && stack_size == 0))
+&& ctx->epilog && !epilog_emitted) {
/* Emit caller's epilog */
ctx->epilog(ctx);
-   /* Emit END */
+   epilog_emitted = TRUE;
+   /* Emit END (or RET) */
+   if (opcode == TGSI_OPCODE_END) {
+  assert(stack_size == 0);
+   }
ctx->emit_instruction(ctx, fullinst);
 }
 else {
+   switch (opcode) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_UIF:
+   case TGSI_OPCODE_SWITCH:
+   case TGSI_OPCODE_BGNLOOP:
+   case TGSI_OPCODE_CAL:
+  stack_size++;
+  break;
+   case TGSI_OPCODE_ENDIF:
+   case TGSI_OPCODE_ENDSWITCH:
+   case TGSI_OPCODE_ENDLOOP:
+   case TGSI_OPCODE_ENDSUB:
+  assert(stack_size > 0);
+  stack_size--;
+  break;
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_RET:
+   default:
+  break;
+   }
if (ctx->transform_instruction)
   ctx->transform_instruction(ctx, fullinst);
else
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] u_blit, u_simple_shaders: add shader to convert from xrbias format

2018-02-06 Thread sroland
From: Roland Scheidegger 

We need this to handle some oddball dx10 format
(DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM). What you can do with this
format is very limited, hence we don't want to add it as a gallium
format (we could not express the properties of this format as
ordinary format properties either, so like all special formats
it would need specific code for handling it in any case).
While here, also nuke the array of different shaders for different
writemasks, as it was not actually used (full masks are always
passed in for generating shaders).
---
 src/gallium/auxiliary/util/u_blit.c   | 40 +-
 src/gallium/auxiliary/util/u_blit.h   |  3 +-
 src/gallium/auxiliary/util/u_simple_shaders.c | 48 +++
 src/gallium/auxiliary/util/u_simple_shaders.h |  4 +++
 4 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_blit.c 
b/src/gallium/auxiliary/util/u_blit.c
index 3f92476..bf1dea7 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -65,7 +65,7 @@ struct blit_state
struct pipe_vertex_element velem[2];
 
void *vs;
-   void *fs[PIPE_MAX_TEXTURE_TYPES][TGSI_WRITEMASK_XYZW + 1][3];
+   void *fs[PIPE_MAX_TEXTURE_TYPES][4];
 
struct pipe_resource *vbuf;  /**< quad vertices */
unsigned vbuf_slot;
@@ -135,17 +135,15 @@ void
 util_destroy_blit(struct blit_state *ctx)
 {
struct pipe_context *pipe = ctx->pipe;
-   unsigned i, j, k;
+   unsigned i, j;
 
if (ctx->vs)
   pipe->delete_vs_state(pipe, ctx->vs);
 
for (i = 0; i < ARRAY_SIZE(ctx->fs); i++) {
   for (j = 0; j < ARRAY_SIZE(ctx->fs[i]); j++) {
- for (k = 0; k < ARRAY_SIZE(ctx->fs[i][j]); k++) {
-if (ctx->fs[i][j][k])
-   pipe->delete_fs_state(pipe, ctx->fs[i][j][k]);
- }
+ if (ctx->fs[i][j])
+pipe->delete_fs_state(pipe, ctx->fs[i][j]);
   }
}
 
@@ -159,8 +157,9 @@ util_destroy_blit(struct blit_state *ctx)
  * Helper function to set the fragment shaders.
  */
 static inline void
-set_fragment_shader(struct blit_state *ctx, uint writemask,
+set_fragment_shader(struct blit_state *ctx,
 enum pipe_format format,
+boolean src_xrbias,
 enum pipe_texture_target pipe_tex)
 {
enum tgsi_return_type stype;
@@ -177,19 +176,29 @@ set_fragment_shader(struct blit_state *ctx, uint 
writemask,
   idx = 2;
}
 
-   if (!ctx->fs[pipe_tex][writemask][idx]) {
+   if (src_xrbias) {
+  assert(stype == TGSI_RETURN_TYPE_FLOAT);
+  idx = 3;
+  if (!ctx->fs[pipe_tex][idx]) {
+ unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex, 0);
+ ctx->fs[pipe_tex][idx] =
+util_make_fragment_tex_shader_xrbias(ctx->pipe, tgsi_tex);
+  }
+   }
+
+   else if (!ctx->fs[pipe_tex][idx]) {
   unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex, 0);
 
   /* OpenGL does not allow blits from signed to unsigned integer
* or vice versa. */
-  ctx->fs[pipe_tex][writemask][idx] =
+  ctx->fs[pipe_tex][idx] =
  util_make_fragment_tex_shader_writemask(ctx->pipe, tgsi_tex,
  TGSI_INTERPOLATE_LINEAR,
- writemask,
+ TGSI_WRITEMASK_XYZW,
  stype, stype, false, false);
}
 
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][writemask][idx]);
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][idx]);
 }
 
 
@@ -491,8 +500,8 @@ util_blit_pixels(struct blit_state *ctx,
  * The sampler view's first_layer indicate the layer to use, but for
  * cube maps it must point to the first face.  Face is passed in src_face.
  *
- * The main advantage over util_blit_pixels is that it allows to specify 
swizzles in
- * pipe_sampler_view::swizzle_?.
+ * The main advantage over util_blit_pixels is that it allows to specify
+ * swizzles in pipe_sampler_view::swizzle_?.
  *
  * But there is no control over blitting Z and/or stencil.
  */
@@ -505,7 +514,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
  struct pipe_surface *dst,
  int dstX0, int dstY0,
  int dstX1, int dstY1,
- float z, uint filter)
+ float z, uint filter,
+ boolean src_xrbias)
 {
boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT;
struct pipe_framebuffer_state fb;
@@ -593,7 +603,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
cso_set_sampler_views(ctx->cso, PIPE_SHADER_FRAGMENT, 1, &src_sampler_view);
 
/* shaders */
-   set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW,
+   set_fragment_shader(ctx, src_xrbias,
src_sampler_view->format,
src_sampler_view->texture->target

[Mesa-dev] [PATCH 1/2] u_simple_shaders: fix mask handling in util_make_fragment_tex_shader_writemask

2018-02-06 Thread sroland
From: Roland Scheidegger 

The writemask handling was busted, since writing the defaults to the
output meant they got overwritten by the tex sampling anyway. The
affected components were undefined though, so maybe with some luck it
still would have worked with some drivers - if not, we could as well
kill it... (This would have affected u_blitter but not u_blit since
the latter always used xyzw mask.)
---
 src/gallium/auxiliary/util/u_simple_shaders.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c 
b/src/gallium/auxiliary/util/u_simple_shaders.c
index 9679545..a301c05 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -275,7 +275,7 @@ util_make_fragment_tex_shader_writemask(struct pipe_context 
*pipe,
if (writemask != TGSI_WRITEMASK_XYZW) {
   struct ureg_src imm = ureg_imm4f( ureg, 0, 0, 0, 1 );
 
-  ureg_MOV( ureg, out, imm );
+  ureg_MOV(ureg, temp, imm);
}
 
if (tex_target == TGSI_TEXTURE_BUFFER)
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] r600: partly fix sampleMaskIn value

2018-02-04 Thread sroland
From: Roland Scheidegger 

The hw gives us coverage for pixel, not for individual fragment shader
invocations, in case execution isn't per pixel (note eg, unlike cm, actually
cannot do "real" minSampleShading, it's either per-pixel or per-fragment, but
it doesn't really make a difference here).
Also, with msaa disabled, the hw still gives us a mask corresponding to the
number of samples, where GL requires this to be 1.
Fix this up by masking the sampleMaskIn bits with the bit corresponding to
the sampleID, if we know this shader is always executed at per-sample
granularity. (In case of a per-sample frequency shader and msaa disabled, the
sampleID will always be 0, so this works just fine there.)
Fixing this for the minSampleShading case will require a shader key (radeonsi
uses the prolog part for this) (for eg, could get away with a single bit, cm
would need either more bits depending on sample/invocation ratio, or read the
bits from a uniform), unless we'd want to always use a sample mask uniform
(which is probably not a good idea, as it would make the ordinary common msaa
case slower for no good reason).
This fixes some parts of piglit arb_sample_shading-samplemask (needs fixed
test), in particular those which use a sampleID, while still failing others
as expected.
---
 src/gallium/drivers/r600/r600_shader.c | 54 ++
 1 file changed, 54 insertions(+)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 1009411c62..8779f166aa 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1138,6 +1138,11 @@ static int allocate_system_value_inputs(struct 
r600_shader_ctx *ctx, int gpr_off
 
tgsi_parse_free(&parse);
 
+   if (ctx->info.reads_samplemask &&
+   (ctx->info.uses_linear_sample || ctx->info.uses_linear_sample)) {
+   inputs[1].enabled = true;
+   }
+
if (ctx->bc->chip_class >= EVERGREEN) {
int num_baryc = 0;
/* assign gpr to each interpolator according to priority */
@@ -3503,8 +3508,57 @@ static int r600_shader_from_tgsi(struct r600_context 
*rctx,
r = eg_load_helper_invocation(&ctx);
if (r)
return r;
+   }
+
+   /*
+* XXX this relies on fixed_pt_position_gpr only being present when
+* this shader should be executed per sample. Should be the case for 
now...
+*/
+   if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
+   /*
+* Fix up sample mask. The hw always gives us coverage mask for
+* the pixel. However, for per-sample shading, we need the
+* coverage for the shader invocation only.
+* Also, with disabled msaa, only the first bit should be set
+* (luckily the same fixup works for both problems).
+* For now, we can only do it if we know this shader is always
+* executed per sample (due to usage of bits in the shader
+* forcing per-sample execution).
+* If the fb is not multisampled, we'd do unnecessary work but
+* it should still be correct.
+* It will however do nothing for sample shading according
+* to MinSampleShading.
+*/
+   struct r600_bytecode_alu alu;
+   int tmp = r600_get_temp(&ctx);
+   assert(ctx.face_gpr != -1);
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+
+   alu.op = ALU_OP2_LSHL_INT;
+   alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+   alu.src[0].value = 0x1;
+   alu.src[1].sel = ctx.fixed_pt_position_gpr;
+   alu.src[1].chan = 3;
+   alu.dst.sel = tmp;
+   alu.dst.chan = 0;
+   alu.dst.write = 1;
+   alu.last = 1;
+   if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
+   return r;
 
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP2_AND_INT;
+   alu.src[0].sel = tmp;
+   alu.src[1].sel = ctx.face_gpr;
+   alu.src[1].chan = 2;
+   alu.dst.sel = ctx.face_gpr;
+   alu.dst.chan = 2;
+   alu.dst.write = 1;
+   alu.last = 1;
+   if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
+   return r;
}
+
if (ctx.fragcoord_input >= 0) {
if (ctx.bc->chip_class == CAYMAN) {
for (j = 0 ; j < 4; j++) {
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] r600/cm: (trivial) code cleanup for emitting msaa state

2018-02-04 Thread sroland
From: Roland Scheidegger 

No functional change (compile tested only).
---
 src/gallium/drivers/r600/cayman_msaa.c  | 14 ++
 src/gallium/drivers/r600/evergreen_state.c  | 10 ++
 src/gallium/drivers/r600/r600_pipe_common.h |  6 ++
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/r600/cayman_msaa.c 
b/src/gallium/drivers/r600/cayman_msaa.c
index 6bc307a4bc..f97924ac22 100644
--- a/src/gallium/drivers/r600/cayman_msaa.c
+++ b/src/gallium/drivers/r600/cayman_msaa.c
@@ -141,7 +141,7 @@ void cayman_init_msaa(struct pipe_context *ctx)
cayman_get_sample_position(ctx, 16, i, 
rctx->sample_locations_16x[i]);
 }
 
-void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples)
+static void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int 
nr_samples)
 {
switch (nr_samples) {
default:
@@ -202,9 +202,8 @@ void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs 
*cs, int nr_samples)
}
 }
 
-void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
-int ps_iter_samples, int overrast_samples,
-unsigned sc_mode_cntl_1)
+void cayman_emit_msaa_state(struct radeon_winsys_cs *cs, int nr_samples,
+   int ps_iter_samples, int overrast_samples)
 {
int setup_samples = nr_samples > 1 ? nr_samples :
overrast_samples > 1 ? overrast_samples : 0;
@@ -216,6 +215,13 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, 
int nr_samples,
 *   endcaps.
 */
unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
+   unsigned sc_mode_cntl_1 =
+   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1);
+
+   if (nr_samples > 1) {
+   cayman_emit_msaa_sample_locs(cs, nr_samples);
+   }
 
if (setup_samples > 1) {
/* indexed by log2(nr_samples) */
diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 385d017840..9620fa9e7a 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1948,14 +1948,8 @@ static void evergreen_emit_framebuffer_state(struct 
r600_context *rctx, struct r
if (rctx->b.chip_class == EVERGREEN) {
evergreen_emit_msaa_state(rctx, rctx->framebuffer.nr_samples, 
rctx->ps_iter_samples);
} else {
-   unsigned sc_mode_cntl_1 =
-   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
-   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1);
-
-   if (rctx->framebuffer.nr_samples > 1)
-   cayman_emit_msaa_sample_locs(cs, 
rctx->framebuffer.nr_samples);
-   cayman_emit_msaa_config(cs, rctx->framebuffer.nr_samples,
-   rctx->ps_iter_samples, 0, 
sc_mode_cntl_1);
+   cayman_emit_msaa_state(cs, rctx->framebuffer.nr_samples,
+  rctx->ps_iter_samples, 0);
}
 }
 
diff --git a/src/gallium/drivers/r600/r600_pipe_common.h 
b/src/gallium/drivers/r600/r600_pipe_common.h
index 86a20f8639..ee8eb54920 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.h
+++ b/src/gallium/drivers/r600/r600_pipe_common.h
@@ -799,10 +799,8 @@ extern const unsigned eg_max_dist_4x;
 void cayman_get_sample_position(struct pipe_context *ctx, unsigned 
sample_count,
unsigned sample_index, float *out_value);
 void cayman_init_msaa(struct pipe_context *ctx);
-void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples);
-void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
-int ps_iter_samples, int overrast_samples,
-unsigned sc_mode_cntl_1);
+void cayman_emit_msaa_state(struct radeon_winsys_cs *cs, int nr_samples,
+   int ps_iter_samples, int overrast_samples);
 
 
 /* Inline helpers. */
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/4] r600: clean up fragment shader input scan code

2018-02-04 Thread sroland
From: Roland Scheidegger 

For some reason, we were iterating through the code twice (first just for
instructions needing barycentrics, then for instructions and input dcls).
Move things around slightly so this is no longer necessary.
There also was an unneeded enabling of the fixed_pt_position_gpr - this is only
needed if the per-sample interpolation comes from an input, not from an
instruction (just move the assert where it belongs) (since the sample id to
sample from comes from a tgsi src in this case, and isn't sampleID).
Otherwise there should be no functional change.
---
 src/gallium/drivers/r600/r600_shader.c | 75 +++---
 1 file changed, 23 insertions(+), 52 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 13aa681049..1009411c62 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -,7 +,6 @@ static int allocate_system_value_inputs(struct 
r600_shader_ctx *ctx, int gpr_off
 
if (inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_SAMPLE) {
location = TGSI_INTERPOLATE_LOC_CENTER;
-   inputs[1].enabled = true; /* needs 
SAMPLEID */
} else if (inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_OFFSET) {
location = TGSI_INTERPOLATE_LOC_CENTER;
/* Needs sample positions, currently 
those are always available */
@@ -1139,6 +1138,19 @@ static int allocate_system_value_inputs(struct 
r600_shader_ctx *ctx, int gpr_off
 
tgsi_parse_free(&parse);
 
+   if (ctx->bc->chip_class >= EVERGREEN) {
+   int num_baryc = 0;
+   /* assign gpr to each interpolator according to priority */
+   for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
+   if (ctx->eg_interpolators[i].enabled) {
+   ctx->eg_interpolators[i].ij_index = num_baryc;
+   num_baryc++;
+   }
+   }
+   num_baryc = (num_baryc + 1) >> 1;
+   gpr_offset += num_baryc;
+   }
+
for (i = 0; i < ARRAY_SIZE(inputs); i++) {
boolean enabled = inputs[i].enabled;
int *reg = inputs[i].reg;
@@ -1165,18 +1177,21 @@ static int allocate_system_value_inputs(struct 
r600_shader_ctx *ctx, int gpr_off
  * for evergreen we need to scan the shader to find the number of GPRs we need 
to
  * reserve for interpolation and system values
  *
- * we need to know if we are going to emit
- * any sample or centroid inputs
+ * we need to know if we are going to emit any sample or centroid inputs
  * if perspective and linear are required
 */
 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
 {
unsigned i;
-   int num_baryc;
-   struct tgsi_parse_context parse;
 
memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
 
+   /*
+* Could get this information from the shader info. But right now
+* we interpolate all declared inputs, whereas the shader info will
+* only contain the bits if the inputs are actually used, so it might
+* not be safe...
+*/
for (i = 0; i < ctx->info.num_inputs; i++) {
int k;
/* skip position/face/mask/sampleid */
@@ -1193,53 +1208,9 @@ static int evergreen_gpr_count(struct r600_shader_ctx 
*ctx)
ctx->eg_interpolators[k].enabled = TRUE;
}
 
-   if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
-   return 0;
-   }
-
-   /* need to scan shader for system values and 
interpolateAtSample/Offset/Centroid */
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-   tgsi_parse_token(&parse);
-
-   if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
-   const struct tgsi_full_instruction *inst = 
&parse.FullToken.FullInstruction;
-   if (inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_SAMPLE ||
-   inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_OFFSET ||
-   inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_CENTROID)
-   {
-   int interpolate, location, k;
-
-   if (inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_SAMPLE) {
-   location = TGSI_INTERPOLATE_LOC_CENTER;
-   } else if (inst->Instruction.Opcode == 
TGSI_OPCODE_INTERP_OFFSET) {
-   location = TGSI_INTERPOLATE_LOC_CENTER;
-   } else {
-   location = 
TGSI_INTERPOLATE_LOC_CENTROID;
-

[Mesa-dev] [PATCH 2/4] mesa: (trivial) remove unused ignore_sample_qualifier_parameter

2018-02-04 Thread sroland
From: Roland Scheidegger 

This parameter for _mesa_get_min_invocations_per_fragment() was once used
by the intel driver, but it's long gone.
---
 src/mesa/program/program.c| 11 ---
 src/mesa/program/program.h|  3 +--
 src/mesa/state_tracker/st_atom_msaa.c |  2 +-
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index 220efc3539..6aba3cb3f1 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -515,8 +515,7 @@ _mesa_find_free_register(const GLboolean used[],
  */
 GLint
 _mesa_get_min_invocations_per_fragment(struct gl_context *ctx,
-   const struct gl_program *prog,
-   bool ignore_sample_qualifier)
+   const struct gl_program *prog)
 {
/* From ARB_sample_shading specification:
 * "Using gl_SampleID in a fragment shader causes the entire shader
@@ -534,11 +533,9 @@ _mesa_get_min_invocations_per_fragment(struct gl_context 
*ctx,
* "Use of the "sample" qualifier on a fragment shader input
*  forces per-sample shading"
*/
-  if (prog->info.fs.uses_sample_qualifier && !ignore_sample_qualifier)
- return MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1);
-
-  if (prog->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
-   SYSTEM_BIT_SAMPLE_POS))
+  if (prog->info.fs.uses_sample_qualifier ||
+  (prog->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+SYSTEM_BIT_SAMPLE_POS)))
  return MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1);
   else if (ctx->Multisample.SampleShading)
  return MAX2(ceil(ctx->Multisample.MinSampleShadingValue *
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 376da7b2d4..659385f55b 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -108,8 +108,7 @@ _mesa_find_free_register(const GLboolean used[],
 
 extern GLint
 _mesa_get_min_invocations_per_fragment(struct gl_context *ctx,
-   const struct gl_program *prog,
-   bool ignore_sample_qualifier);
+   const struct gl_program *prog);
 
 static inline GLuint
 _mesa_program_enum_to_shader_stage(GLenum v)
diff --git a/src/mesa/state_tracker/st_atom_msaa.c 
b/src/mesa/state_tracker/st_atom_msaa.c
index 589e328ac5..556c7c5889 100644
--- a/src/mesa/state_tracker/st_atom_msaa.c
+++ b/src/mesa/state_tracker/st_atom_msaa.c
@@ -77,5 +77,5 @@ st_update_sample_shading(struct st_context *st)
   return;
 
cso_set_min_samples(st->cso_context,
- _mesa_get_min_invocations_per_fragment(st->ctx, &st->fp->Base, 
false));
+ _mesa_get_min_invocations_per_fragment(st->ctx, &st->fp->Base));
 }
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600: don't do stack workarounds for hemlock

2018-01-29 Thread sroland
From: Roland Scheidegger 

By the looks of it it seems hemlock is treated separately to cypress, but
certainly it won't need the stack workarounds cedar/redwood (and
seemingly every other eg chip except cypress/juniper) need.
(Discovered by accident.)
---
 src/gallium/drivers/r600/sb/sb_bc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/r600/sb/sb_bc.h 
b/src/gallium/drivers/r600/sb/sb_bc.h
index b35671bf0f..a249395474 100644
--- a/src/gallium/drivers/r600/sb/sb_bc.h
+++ b/src/gallium/drivers/r600/sb/sb_bc.h
@@ -665,6 +665,7 @@ public:
return false;
 
switch (hw_chip) {
+   case HW_CHIP_HEMLOCK:
case HW_CHIP_CYPRESS:
case HW_CHIP_JUNIPER:
return false;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] mesa: skip validation of legality of size/type queries for format queries

2018-01-26 Thread sroland
From: Roland Scheidegger 

The size/type query is always legal (if we made it that far).
This causes a difference for GL_TEXTURE_BUFFER - the reason is that these
parameters are valid only with GetTexLevelParameter() if gl 3.1 is supported,
but not if only ARB_texture_buffer_object is supported.
However, while the spec says that these queries return "the same information
as querying GetTexLevelParameter" I believe we're not expected to return just
zeros here. By definition, these pnames are always valid (unlike for the
GetTexLevelParameter() function which would return an error without GL 3.1),
so returning 0 but no error makes no sense to me.

This breaks some piglit arb_internalformat_query2 tests (which I believe to
be wrong).
---
 src/mesa/main/formatquery.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index 2214f97e67..f345140518 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -960,9 +960,6 @@ _mesa_GetInternalformativ(GLenum target, GLenum 
internalformat, GLenum pname,
   mesa_format texformat;
 
   if (target != GL_RENDERBUFFER) {
- if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, true))
-goto end;
-
  baseformat = _mesa_base_tex_format(ctx, internalformat);
   } else {
  baseformat = _mesa_base_fbo_format(ctx, internalformat);
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] mesa: restrict formats being supported by target type for formatquery

2018-01-26 Thread sroland
From: Roland Scheidegger 

The code just considered all formats as being supported if they were either
a valid fbo or texture format.
This was quite awkward since then the query would return "supported" for
e.g. GL_RGB9E5 or compressed formats and target RENDERBUFFER (albeit the driver
could still refuse it in theory). However, when then querying for instance the
internalformat sizes, it would just return 0 (due to the checks being more
strict there).
It was also a problem for texture buffer targets, which have a more restricted
list of formats which are allowed (and again, it would return supported but
then querying sizes would return 0).
So only take validation of formats into account which make sense for a given
target.
Can also toss out some special checks for rgb9e5 later, since we'd never get
there if it wasn't supported in the first place.
---
 src/mesa/main/formatquery.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index e0062a64d2..2214f97e67 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -558,15 +558,29 @@ _is_internalformat_supported(struct gl_context *ctx, 
GLenum target,
 * implementation accepts it for any texture specification 
commands, and
 * - unsized or base internal format, if the implementation accepts
 * it for texture or image specification.
+*
+* But also:
+* "If the particular <target> and <internalformat> combination do not make
+* sense, or if a particular type of <target> is not supported by the
+* implementation the "unsupported" answer should be given. This is not an
+* error.
 */
GLint buffer[1];
 
-   /* At this point an internalformat is valid if it is valid as a texture or
-* as a renderbuffer format. The checks are different because those methods
-* return different values when passing non supported internalformats */
-   if (_mesa_base_tex_format(ctx, internalformat) < 0 &&
-   _mesa_base_fbo_format(ctx, internalformat) == 0)
-  return false;
+   if (target == GL_RENDERBUFFER) {
+  if (_mesa_base_fbo_format(ctx, internalformat) == 0) {
+ return false;
+  }
+   } else if (target == GL_TEXTURE_BUFFER) {
+  if (_mesa_validate_texbuffer_format(ctx, internalformat) ==
+  MESA_FORMAT_NONE) {
+ return false;
+  }
+   } else {
+  if (_mesa_base_tex_format(ctx, internalformat) < 0) {
+ return false;
+  }
+   }
 
/* Let the driver have the final word */
ctx->Driver.QueryInternalFormat(ctx, target, internalformat,
@@ -969,10 +983,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum 
internalformat, GLenum pname,
* and glGetRenderbufferParameteriv functions.
*/
   if (pname == GL_INTERNALFORMAT_SHARED_SIZE) {
- if (_mesa_has_EXT_texture_shared_exponent(ctx) &&
- target != GL_TEXTURE_BUFFER &&
- target != GL_RENDERBUFFER &&
- texformat == MESA_FORMAT_R9G9B9E5_FLOAT) {
+ if (texformat == MESA_FORMAT_R9G9B9E5_FLOAT) {
 buffer[0] = 5;
  }
  goto end;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] mesa: remove misleading gles checks for formatquery

2018-01-26 Thread sroland
From: Roland Scheidegger 

Testing for gles there is just confusing - this is about target being
supported, if it was valid at all was already determined earlier
(in _legal_parameters). It didn't make sense at all in any case, since
it would only have said false there for gles for 2d but not 2d arrays etc.
---
 src/mesa/main/formatquery.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index 61f798c88f..e0062a64d2 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -392,14 +392,12 @@ _is_target_supported(struct gl_context *ctx, GLenum 
target)
 * implementation the "unsupported" answer should be given.
 * This is not an error."
 *
-* For OpenGL ES, queries can only be used with GL_RENDERBUFFER or MS.
+* Note that legality of targets has already been verified.
 */
switch(target){
case GL_TEXTURE_1D:
case GL_TEXTURE_2D:
case GL_TEXTURE_3D:
-  if (!_mesa_is_desktop_gl(ctx))
- return false;
   break;
 
case GL_TEXTURE_1D_ARRAY:
@@ -702,6 +700,12 @@ _mesa_query_internal_format_default(struct gl_context 
*ctx, GLenum target,
case GL_FRAMEBUFFER_RENDERABLE_LAYERED:
case GL_FRAMEBUFFER_BLEND:
case GL_FILTER:
+  /*
+   * XXX seems a tad optimistic just saying yes to everything here.
+   * Even for combinations which make no sense...
+   * And things like TESS_CONTROL_TEXTURE should definitely default to
+   * NONE if the driver doesn't even support tessellation...
+   */
   params[0] = GL_FULL_SUPPORT;
   break;
case GL_NUM_TILING_TYPES_EXT:
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix crash with seamless cube filtering with different min/mag filter

2018-01-24 Thread sroland
From: Roland Scheidegger 

We are not allowed to modify the incoming coords values, or things may
crash (as we may be inside a llvm conditional and the values may be used
in another branch).
I recently broke this when fixing an issue with NaNs and seamless cube
map filtering, and it causes crashes when doing cubemap filtering
if the min and mag filters are different.
Add const to the pointers passed in to prevent this mishap in the future.

Fixes: a485ad0bcd ("gallivm: fix an issue with NaNs with seamless cube 
filtering")
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 38 +--
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index ff8cbf6..8f760f5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -857,7 +857,7 @@ lp_build_sample_image_nearest(struct 
lp_build_sample_context *bld,
   LLVMValueRef img_stride_vec,
   LLVMValueRef data_ptr,
   LLVMValueRef mipoffsets,
-  LLVMValueRef *coords,
+  const LLVMValueRef *coords,
   const LLVMValueRef *offsets,
   LLVMValueRef colors_out[4])
 {
@@ -1004,7 +1004,7 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
  LLVMValueRef img_stride_vec,
  LLVMValueRef data_ptr,
  LLVMValueRef mipoffsets,
- LLVMValueRef *coords,
+ const LLVMValueRef *coords,
  const LLVMValueRef *offsets,
  LLVMValueRef colors_out[4])
 {
@@ -1106,7 +1106,7 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
   struct lp_build_if_state edge_if;
   LLVMTypeRef int1t;
   LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
-  LLVMValueRef coord, have_edge, have_corner;
+  LLVMValueRef coord0, coord1, have_edge, have_corner;
   LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, 
fall_off_y;
   LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
   LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
@@ -1130,20 +1130,20 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
* other values might be bogus in the end too).
* So kill off the NaNs here.
*/
-  coords[0] = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
-   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
-  coords[1] = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
-   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
-  coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
+  coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
+GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+  coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
   /* instead of clamp, build mask if overflowed */
-  coord = lp_build_sub(coord_bld, coord, half);
+  coord0 = lp_build_sub(coord_bld, coord0, half);
   /* convert to int, compute lerp weight */
   /* not ideal with AVX (and no AVX2) */
-  lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
+  lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
   x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
-  coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
-  coord = lp_build_sub(coord_bld, coord, half);
-  lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
+  coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
+GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+  coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
+  coord1 = lp_build_sub(coord_bld, coord1, half);
+  lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
   y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
 
   fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
@@ -1747,7 +1747,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context 
*bld,
unsigned img_filter,
unsigned mip_filter,
boolean is_gather,
-   LLVMValueRef *coords,
+   const LLVMValueRef *coords,
const LLVMValueRef *offsets,
LLVMValueRef ilevel0,
LLVMValueRef ilevel1,
@@ -1820,6 +1820,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context 
*bld,
   PIPE_FUNC_GREATER,
   lod_fpart, bld->lodf_bld.zero);
  need_lerp =

[Mesa-dev] [PATCH] r600: increase number of samplers/views from 16 to 18 on eg

2018-01-22 Thread sroland
From: Roland Scheidegger 

Some apps are known to require more than 16. Albeit they probably still won't
run with 18 (since all new hw/drivers support 32) it shouldn't hurt to at
least support 18 (seemingly the hw limit on all r600-ni chips - the blob also
supports 18, at least for eg+ by the looks of it).

Unfortunately border colors do not work for the last 2 units. The reg guide
says there is a 5 bit index for setting border colors, but this is a lie.
piglit max-samplers shows that indeed setting border color for units 16/17
(per stage) will simply overwrite the border color for units 0/1, and sampling
will consequently also use those border color values for sampling on units
16/17. (For eg - no idea about ni.)
This will cause piglit max-samplers border to fail, but meh... border colors
are more or less totally busted (sampler swizzling...) on that hw anyway.
Border colors should still work if not both units 0 and 16 (or units 1 and 17)
use a border color simultaneously.

Setting border color values on r600/r700 is different, and I have no idea
if the hw would also wrap-around when trying to use border colors or do
something crazy (like locking up...) so don't increase the limit there (since
the blob doesn't do it I'm not sure if it would be safe).
---
 src/gallium/drivers/r600/evergreen_state.c   | 7 +++
 src/gallium/drivers/r600/r600_pipe.c | 6 +-
 src/gallium/drivers/r600/r600_pipe.h | 8 
 src/gallium/drivers/r600/r600_state_common.c | 2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index fb1de9cbf4..55a460053c 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2372,6 +2372,13 @@ static void evergreen_emit_sampler_states(struct 
r600_context *rctx,
radeon_emit(cs, (resource_id_base + i) * 3);
radeon_emit_array(cs, rstate->tex_sampler_words, 3);
 
+   /*
+* Note for sampler 16/17 this will overwrite border color
+* on sampler 0/1. As long as border color isn't used on
+* both units 0 and 16 (or 1 and 17) it should actually work
+* since the sampler also appears to remap those border color
+* values the same way.
+*/
if (rstate->border_color_use) {
radeon_set_config_reg_seq(cs, border_index_reg, 5);
radeon_emit(cs, i);
diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index 95aa2e5383..7f9500ad4b 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -595,7 +595,11 @@ static int r600_get_shader_param(struct pipe_screen* 
pscreen,
return 1;
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
-   return 16;
+   /*
+* There is potentially even more trouble with border colors
+* for units 16/17 on r600/r700, so only enable 18 on eg+
+*/
+   return rscreen->b.family >= CHIP_CEDAR ? R600_NUM_TEX_UNITS : 
16;
 case PIPE_SHADER_CAP_PREFERRED_IR:
if (shader == PIPE_SHADER_COMPUTE) {
return PIPE_SHADER_IR_NATIVE;
diff --git a/src/gallium/drivers/r600/r600_pipe.h 
b/src/gallium/drivers/r600/r600_pipe.h
index 112b5cbb83..e2bd7b0a99 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -366,7 +366,7 @@ struct r600_pipe_sampler_state {
 };
 
 /* needed for blitter save */
-#define NUM_TEX_UNITS 16
+#define R600_NUM_TEX_UNITS 18
 
 struct r600_seamless_cube_map {
struct r600_atomatom;
@@ -375,7 +375,7 @@ struct r600_seamless_cube_map {
 
 struct r600_samplerview_state {
struct r600_atomatom;
-   struct r600_pipe_sampler_view   *views[NUM_TEX_UNITS];
+   struct r600_pipe_sampler_view   *views[R600_NUM_TEX_UNITS];
uint32_tenabled_mask;
uint32_tdirty_mask;
uint32_tcompressed_depthtex_mask; /* which 
textures are depth */
@@ -385,7 +385,7 @@ struct r600_samplerview_state {
 
 struct r600_sampler_states {
struct r600_atomatom;
-   struct r600_pipe_sampler_state  *states[NUM_TEX_UNITS];
+   struct r600_pipe_sampler_state  *states[R600_NUM_TEX_UNITS];
uint32_tenabled_mask;
uint32_tdirty_mask;
uint32_thas_bordercolor_mask; /* which states 
contain the border color */
@@ -394,7 +394,7 @@ struct r600_sampler_states {
 struct r600_textures_info {
struct r600_samplerview_state   views;
struct r600_sampler_states  states;
-   

[Mesa-dev] [PATCH] draw: remove VSPLIT_CREATE_IDX macro

2018-01-16 Thread sroland
From: Roland Scheidegger 

Just inline the little bit of code.
---
 src/gallium/auxiliary/draw/draw_pt_vsplit.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c 
b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
index 3ff077b..653deab 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -116,21 +116,15 @@ vsplit_get_base_idx(unsigned start, unsigned fetch)
return draw_overflow_uadd(start, fetch, MAX_ELT_IDX);
 }
 
-/*
- * The final element index is just element index plus element bias.
- */
-#define VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias)\
-   unsigned elt_idx;   \
-   elt_idx = vsplit_get_base_idx(start, fetch);\
-   elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + (int)elt_bias);
-
 
 static inline void
 vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
unsigned start, unsigned fetch, int elt_bias)
 {
struct draw_context *draw = vsplit->draw;
-   VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
+   unsigned elt_idx;
+   elt_idx = vsplit_get_base_idx(start, fetch);
+   elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + elt_bias);
/* unlike the uint case this can only happen with elt_bias */
if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && 
!vsplit->cache.has_max_fetch) {
   unsigned hash = elt_idx % MAP_SIZE;
@@ -145,7 +139,9 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, 
const ushort *elts,
unsigned start, unsigned fetch, int elt_bias)
 {
struct draw_context *draw = vsplit->draw;
-   VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
+   unsigned elt_idx;
+   elt_idx = vsplit_get_base_idx(start, fetch);
+   elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + elt_bias);
/* unlike the uint case this can only happen with elt_bias */
if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && 
!vsplit->cache.has_max_fetch) {
   unsigned hash = elt_idx % MAP_SIZE;
@@ -165,7 +161,12 @@ vsplit_add_cache_uint(struct vsplit_frontend *vsplit, 
const uint *elts,
   unsigned start, unsigned fetch, int elt_bias)
 {
struct draw_context *draw = vsplit->draw;
-   VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
+   unsigned elt_idx;
+   /*
+* The final element index is just element index plus element bias.
+*/
+   elt_idx = vsplit_get_base_idx(start, fetch);
+   elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + elt_bias);
/* Take care for DRAW_MAX_FETCH_IDX (since cache is initialized to -1). */
if (elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) {
   unsigned hash = elt_idx % MAP_SIZE;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: fix vsplit code when the (post-bias) index value is -1

2018-01-15 Thread sroland
From: Roland Scheidegger 

vsplit_add_cache uses the post-bias index for hashing, but the
vsplit_add_cache_uint/ushort/ubyte ones used the pre-bias index, therefore
the code for handling the special case (because -1 matches the initialization
value of the cache) wasn't actually working.
Commit 78a997f72841310620d18daa9015633343d04db1 actually simplified the
cache logic somewhat, but it looks like this particular problem carried over
(and was duplicated to the ushort/ubyte cases, since before only uint needed it).
This could lead to the vsplit cache doing the wrong thing, in particular
later fetch_info might indicate there are 0 values to fetch. This only really
affected edge cases which were bogus to begin with, but it could lead to a
crash with the jit vertex shader, since it cannot handle this case correctly
(the count loop is always executed at least once and we would not allocate
any memory for the shader outputs), so add another assert to catch it there.
---
 src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c | 1 +
 src/gallium/auxiliary/draw/draw_pt_vsplit.c| 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c 
b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index c6492a1..5e0c562 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -368,6 +368,7 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle,
unsigned start_or_maxelt, vid_base;
const unsigned *elts;
 
+   assert(fetch_info->count > 0);
llvm_vert_info.count = fetch_info->count;
llvm_vert_info.vertex_size = fpme->vertex_size;
llvm_vert_info.stride = fpme->vertex_size;
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c 
b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
index a68d5bf..3ff077b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -133,7 +133,7 @@ vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, 
const ubyte *elts,
VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
/* unlike the uint case this can only happen with elt_bias */
if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && 
!vsplit->cache.has_max_fetch) {
-  unsigned hash = fetch % MAP_SIZE;
+  unsigned hash = elt_idx % MAP_SIZE;
   vsplit->cache.fetches[hash] = 0;
   vsplit->cache.has_max_fetch = TRUE;
}
@@ -148,7 +148,7 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, 
const ushort *elts,
VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
/* unlike the uint case this can only happen with elt_bias */
if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && 
!vsplit->cache.has_max_fetch) {
-  unsigned hash = fetch % MAP_SIZE;
+  unsigned hash = elt_idx % MAP_SIZE;
   vsplit->cache.fetches[hash] = 0;
   vsplit->cache.has_max_fetch = TRUE;
}
@@ -168,7 +168,7 @@ vsplit_add_cache_uint(struct vsplit_frontend *vsplit, const 
uint *elts,
VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
/* Take care for DRAW_MAX_FETCH_IDX (since cache is initialized to -1). */
if (elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) {
-  unsigned hash = fetch % MAP_SIZE;
+  unsigned hash = elt_idx % MAP_SIZE;
   /* force update - any value will do except DRAW_MAX_FETCH_IDX */
   vsplit->cache.fetches[hash] = 0;
   vsplit->cache.has_max_fetch = TRUE;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600: fix relocs for PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE query

2018-01-11 Thread sroland
From: Roland Scheidegger 

The command parser is very sad if we don't emit the relocs per hw query...

However, don't enable it. It mostly works, but piglit
arb_transform_feedback_overflow_query-basic shows 2 failures (it's really the
same case for the hw), conditional_render_any and conditional_render_single.
By some experimentation, it looks like the firmware combines the values wrongly
for the non-inverted (i.e. hw-inverted) case - it will only not draw if all
4 streams overflow, rather than just at least one.
Interestingly, radeonsi has a workaround for some VI firmware which looks like
it was the exact same firmware bug. Hence, looks like it would need new
firmware to properly fix this.
(Tested on Juniper, not sure if firmware for all chips is broken.)
---
 src/gallium/drivers/r600/r600_query.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_query.c 
b/src/gallium/drivers/r600/r600_query.c
index b4519830cc..5ff0570308 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -742,9 +742,12 @@ static void r600_query_hw_do_emit_start(struct 
r600_common_context *ctx,
emit_sample_streamout(cs, va, query->stream);
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
-   for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
+   for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
emit_sample_streamout(cs, va + 32 * stream, stream);
-   break;
+   r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf,
+   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+   }
+   return;
case PIPE_QUERY_TIME_ELAPSED:
/* Write the timestamp after the last draw is done.
 * (bottom-of-pipe)
@@ -827,9 +830,12 @@ static void r600_query_hw_do_emit_stop(struct 
r600_common_context *ctx,
break;
case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
va += 16;
-   for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
+   for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
emit_sample_streamout(cs, va + 32 * stream, stream);
-   break;
+   r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf,
+   RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
+   }
+   return;
case PIPE_QUERY_TIME_ELAPSED:
va += 8;
/* fall through */
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: require at least 14 UBOs for GL 4.3

2018-01-10 Thread sroland
From: Roland Scheidegger 

ARB_ubo requires 12 UBOs (per stage) at least, but this limit has been
raised by GL 4.3 to 14, so don't advertize GL 4.3 without it (only checking
the vertex stage since all drivers probably have the same limit anyway for
other stages). (piglit has minmax tests for that kind of thing, but they go
only up to 3.3, so this won't really be noticed.)
I think this currently should not affect any driver - r600 until very
recently only supported 12 but now advertizes 14 too.
---
 src/mesa/main/version.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 90c5c5f..68079f4 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -352,6 +352,7 @@ compute_version(const struct gl_extensions *extensions,
  extensions->ARB_transform_feedback_instanced);
const bool ver_4_3 = (ver_4_2 &&
  consts->GLSLVersion >= 430 &&
+ consts->Program[MESA_SHADER_VERTEX].MaxUniformBlocks 
>= 14 &&
  extensions->ARB_ES3_compatibility &&
  extensions->ARB_arrays_of_arrays &&
  extensions->ARB_compute_shader &&
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] util: fix NORETURN for msvc, add HAVE_FUNC_ATTRIBUTE_NORETURN to c99_compat.h

2018-01-09 Thread sroland
From: Roland Scheidegger 

We've seen some problems internally due to macro redefinition.
Fix this by adding HAVE_FUNC_ATTRIBUTE_NORETURN to c99_compat.h,
and defining it for msvc.
And avoid redefinition just in case.
---
 include/c99_compat.h |  1 +
 src/util/macros.h| 12 
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/c99_compat.h b/include/c99_compat.h
index cb690c6..81621a7 100644
--- a/include/c99_compat.h
+++ b/include/c99_compat.h
@@ -164,6 +164,7 @@ test_c99_compat_h(const void * restrict a,
 #define HAVE_FUNC_ATTRIBUTE_FORMAT 1
 #define HAVE_FUNC_ATTRIBUTE_PACKED 1
 #define HAVE_FUNC_ATTRIBUTE_ALIAS 1
+#define HAVE_FUNC_ATTRIBUTE_NORETURN 1
 
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
/* https://gcc.gnu.org/onlinedocs/gcc-4.3.6/gcc/Other-Builtins.html */
diff --git a/src/util/macros.h b/src/util/macros.h
index 2a08407..5ce0e57 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -171,10 +171,14 @@ do {   \
 #define ATTRIBUTE_RETURNS_NONNULL
 #endif
 
-#ifdef HAVE_FUNC_ATTRIBUTE_NORETURN
-#define NORETURN __attribute__((__noreturn__))
-#else
-#define NORETURN
+#ifndef NORETURN
+#  ifdef _MSC_VER
+#define NORETURN __declspec(noreturn)
+#  elif defined HAVE_FUNC_ATTRIBUTE_NORETURN
+#define NORETURN __attribute__((__noreturn__))
+#  else
+#define NORETURN
+#  endif
 #endif
 
 #ifdef __cplusplus
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] r600: fix enabled_rb_mask on eg/cm

2018-01-08 Thread sroland
From: Roland Scheidegger 

For eg/cm, the r600_gb_backend_map will always be 0. I assume this is a bug
in the drm kernel driver, as it just never fills the information in.
I am not entirely sure if the map is supposed to be needed for these chips,
since unlike on r600/r700 the value calculated for the map is in fact written
to the GB_BACKEND_MAP reg, for which I am unable to dig up any documentation.

In any case, this causes r600_query_hw_prepare_buffer to write the "status bit"
(just the highest bit of the occlusion query result) even for active rbes
(all but the first). This doesn't make much sense, albeit I suppose it's mostly
safe. According to the commit history, it's necessary to set these bits for
inactive rbes since otherwise predication will lock up - presumably the hw just
is waiting for the status bit to appear, which will never happen with inactive
rbes. I'd guess potentially predication could be wrong (due to not waiting for
the actual result if the status bit is already there) if this is set for
active rbes.

Discovered while trying to fix predication lockups on Juniper (needs another
patch).
---
 src/gallium/drivers/r600/r600_query.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_query.c 
b/src/gallium/drivers/r600/r600_query.c
index 987da9a806..699404b10d 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -1834,8 +1834,14 @@ void r600_query_fix_enabled_rb_mask(struct 
r600_common_screen *rscreen)
 
assert(rscreen->chip_class <= CAYMAN);
 
-   /* if backend_map query is supported by the kernel */
-   if (rscreen->info.r600_gb_backend_map_valid) {
+   /*
+* if backend_map query is supported by the kernel.
+* Note the kernel drm driver (as of now) never fills in the associated
+* data on eg/cm, only r600/r700, hence ignore the valid bit there.
+* (Albeit some chips with just one active rb can have a valid 0 map.)
+*/ 
+   if (rscreen->info.r600_gb_backend_map_valid &&
+   (ctx->chip_class < EVERGREEN || rscreen->info.r600_gb_backend_map 
!= 0)) {
unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
unsigned backend_map = rscreen->info.r600_gb_backend_map;
unsigned item_width, item_mask;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] r600: hack up num_render_backends on Juniper to 8

2018-01-08 Thread sroland
From: Roland Scheidegger 

Juniper really has a maximum of 4 RBEs (16 pixels). However, predication
always locks up on my HD 5750, and through experiments it looks like if we're
pretending it has a maximum of 8, with 4 disabled, it works correctly.
My conclusion would be that there's a bug (likely firmware, not hw) which
causes the predication logic to try to read 8 results out of the query buffer
instead of just 4, and since of course no one ever writes the upper 4, the
status bit is never set and hence it will wait for it forever.

Ideally this would be fixed in firmware, but I'd guess chances of that
happening are slim.
This will double the size of (occlusion) query result buffers, write the
status bit for the disabled rbs in these buffers, and will also add 8 results
together instead of just 4 when reading them back. The latter is unnecessary,
but it's probably not worth bothering - luckily num_render_backends isn't
used outside of occlusion queries, so don't need separate value for the
"real" maximum.
Also print out the enabled_rb_mask if it changed from the pre-fixed value
(which is already printed out), just in case there are some more problems
with chips which have some rbs disabled...

This fixes all the lockups with piglit nv_conditional_render tests on my
HD 5750 (all pass).
---
 src/gallium/drivers/r600/r600_query.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_query.c 
b/src/gallium/drivers/r600/r600_query.c
index 699404b10d..6fc00819b1 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -1830,7 +1830,19 @@ void r600_query_fix_enabled_rb_mask(struct 
r600_common_screen *rscreen)
struct r600_resource *buffer;
uint32_t *results;
unsigned i, mask = 0;
-   unsigned max_rbs = ctx->screen->info.num_render_backends;
+   unsigned max_rbs;
+   
+   if (ctx->family == CHIP_JUNIPER) {
+   /*
+* Fix for predication lockups - the chip can only ever have
+* 4 RBs, however it looks like the predication logic assumes
+* there's 8, trying to read results from query buffers never
+* written to. By increasing this number we'll write the
+* status bit for these as per the normal disabled rb logic.
+*/
+   ctx->screen->info.num_render_backends = 8;
+   }
+   max_rbs = ctx->screen->info.num_render_backends;
 
assert(rscreen->chip_class <= CAYMAN);
 
@@ -1901,8 +1913,13 @@ void r600_query_fix_enabled_rb_mask(struct 
r600_common_screen *rscreen)
 
r600_resource_reference(&buffer, NULL);
 
-   if (mask)
+   if (mask) {
+   if (rscreen->debug_flags & DBG_INFO &&
+   mask != rscreen->info.enabled_rb_mask) {
+   printf("enabled_rb_mask (fixed) = 0x%x\n", mask);
+   }
rscreen->info.enabled_rb_mask = mask;
+   }
 }
 
 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] winsys/radeon: fix up default enabled_rb_mask for r600

2018-01-08 Thread sroland
From: Roland Scheidegger 

The logic had two fatal flaws which completely killed the default value.
1) drm will overwrite the value anyway even if the chip can't be handled
2) the default value logic is relying on num_render_backends, which was
filled in later.
Luckily no one is relying on it, but it's a bit confusing seeing the chip clock
printed out there (as hex) with R600_DEBUG=info...
(Albeit radeonsi does not appear to fix up the value. If kernels which don't
handle this query are still supported, radeonsi will still end up with a broken
enabled_rb_mask, I have no idea of the potential results of this there.)
---
 src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c 
b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index e600199d26..10f2ecc900 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -369,12 +369,6 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
  &ws->info.max_shader_clock);
 ws->info.max_shader_clock /= 1000;
 
-/* Default value. */
-ws->info.enabled_rb_mask = u_bit_consecutive(0, 
ws->info.num_render_backends);
-/* This fails on non-GCN or older kernels: */
-radeon_get_drm_value(ws->fd, RADEON_INFO_SI_BACKEND_ENABLED_MASK, NULL,
- &ws->info.enabled_rb_mask);
-
 ws->num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 
 /* Generation-specific queries. */
@@ -433,6 +427,16 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
   &ws->info.r600_gb_backend_map))
 ws->info.r600_gb_backend_map_valid = true;
 
+/* Default value. */
+ws->info.enabled_rb_mask = u_bit_consecutive(0, 
ws->info.num_render_backends);
+/*
+ * This fails (silently) on non-GCN or older kernels, overwriting the
+ * default enabled_rb_mask with the result of the last query.
+*/
+if (ws->gen >= DRV_SI)
+radeon_get_drm_value(ws->fd, RADEON_INFO_SI_BACKEND_ENABLED_MASK, 
NULL,
+ &ws->info.enabled_rb_mask);
+
 ws->info.has_virtual_memory = false;
 if (ws->info.drm_minor >= 13) {
 uint32_t ib_vm_max_size;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600: RFC: use GET_BUFFER_RESINFO vtx fetch on eg instead of setting up consts

2018-01-03 Thread sroland
From: Roland Scheidegger 

Contrary to what the comment said, this appears to work just fine on my rv770
(tested with piglit textureSize 140 fs/vs samplerBuffer).
I have no clue though if it's actually preferable to use it (unfortunately
we cannot get rid of the tex constants completely, as we still require them
for cube map txq).
Albeit filling in the format (1 channel or 4?) and the stuff related to mega-
or mini-fetch (what the hell is this...) is just a guess based on other usage
of vtx fetch instructions...

v2: it really needs to be done through texture cache (I botched the
testing because sb optimizations turned it automatically into tc, but
can't rely on it and isn't happening on tes).

Tested-by: Konstantin Kharlamov 
---
 src/gallium/drivers/r600/evergreen_state.c   |  7 ++--
 src/gallium/drivers/r600/r600_asm.c  |  3 +-
 src/gallium/drivers/r600/r600_shader.c   | 59 ++--
 src/gallium/drivers/r600/r600_state_common.c | 39 +++---
 4 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index f5b8e7115d..f645791a2c 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -653,11 +653,12 @@ static void evergreen_fill_buffer_resource_words(struct 
r600_context *rctx,
S_030008_ENDIAN_SWAP(endian);
tex_resource_words[3] = swizzle_res | 
S_03000C_UNCACHED(params->uncached);
/*
-* in theory dword 4 is for number of elements, for use with resinfo,
-* but it seems to utterly fail to work, the amd gpu shader analyser
+* dword 4 is for number of elements, for use with resinfo,
+* albeit the amd gpu shader analyser
 * uses a const buffer to store the element sizes for buffer txq
 */
-   tex_resource_words[4] = 0;
+   tex_resource_words[4] = params->size / stride;
+
tex_resource_words[5] = tex_resource_words[6] = 0;
tex_resource_words[7] = S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER);
 }
diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c
index d6bd561f01..92c2bdf27c 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1510,7 +1510,8 @@ int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
 /* common to all 3 families */
 static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct 
r600_bytecode_vtx *vtx, unsigned id)
 {
-   bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
+   bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(vtx->op) |
+   S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 8a36bcf1b4..d349c9d7f1 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -6949,31 +6949,48 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx 
*ctx, boolean src_requires_l
 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int 
offset)
 {
struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
-   struct r600_bytecode_alu alu;
int r;
int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
+   int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 
0; // CF_INDEX_1 : CF_INDEX_NONE
 
-   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-   alu.op = ALU_OP1_MOV;
-   alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
-   if (ctx->bc->chip_class >= EVERGREEN) {
-   /* with eg each dword is either buf size or number of cubes */
-   alu.src[0].sel += id / 4;
-   alu.src[0].chan = id % 4;
-   } else {
+   if (ctx->bc->chip_class < EVERGREEN) {
+   struct r600_bytecode_alu alu;
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP1_MOV;
+   alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
/* r600 we have them at channel 2 of the second dword */
alu.src[0].sel += (id * 2) + 1;
alu.src[0].chan = 1;
+   alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
+   tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+   alu.last = 1;
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+   return 0;
+   } else {
+   struct r600_bytecode_vtx vtx;
+   memset(&vtx, 0, sizeof(vtx));
+   vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
+   vtx.buffer_id = id + R600_MAX_CONST_BU

[Mesa-dev] [PATCH 4/6] r600: RFC: use GET_BUFFER_RESINFO vtx fetch on eg instead of setting up consts

2018-01-02 Thread sroland
From: Roland Scheidegger 

Contrary to what the comment said, this appears to work just fine on my rv770
(tested with piglit textureSize 140 fs/vs samplerBuffer).
I have no clue though if it's actually preferable to use it (unfortunately
we cannot get rid of the tex constants completely, as we still require them
for cube map txq).
Albeit filling in the format (1 channel or 4?) and the stuff related to mega-
or mini-fetch (what the hell is this...) is just a guess based on other usage
of vtx fetch instructions...
The docs (for eg, not cayman) suggest this has to be done through tc cache
but it seems to work either way (since it actually just fetches the value from
the buffer descriptor I'm not sure why caches would be involved).
---
 src/gallium/drivers/r600/evergreen_state.c   |  7 ++--
 src/gallium/drivers/r600/r600_asm.c  |  3 +-
 src/gallium/drivers/r600/r600_shader.c   | 59 ++--
 src/gallium/drivers/r600/r600_state_common.c | 39 +++---
 4 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index f5b8e7115d..f645791a2c 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -653,11 +653,12 @@ static void evergreen_fill_buffer_resource_words(struct 
r600_context *rctx,
S_030008_ENDIAN_SWAP(endian);
tex_resource_words[3] = swizzle_res | 
S_03000C_UNCACHED(params->uncached);
/*
-* in theory dword 4 is for number of elements, for use with resinfo,
-* but it seems to utterly fail to work, the amd gpu shader analyser
+* dword 4 is for number of elements, for use with resinfo,
+* albeit the amd gpu shader analyser
 * uses a const buffer to store the element sizes for buffer txq
 */
-   tex_resource_words[4] = 0;
+   tex_resource_words[4] = params->size / stride;
+
tex_resource_words[5] = tex_resource_words[6] = 0;
tex_resource_words[7] = S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER);
 }
diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c
index d6bd561f01..92c2bdf27c 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1510,7 +1510,8 @@ int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
 /* common to all 3 families */
 static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct 
r600_bytecode_vtx *vtx, unsigned id)
 {
-   bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
+   bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(vtx->op) |
+   S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 8a36bcf1b4..51c38a6e00 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -6949,31 +6949,48 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx 
*ctx, boolean src_requires_l
 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int 
offset)
 {
struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
-   struct r600_bytecode_alu alu;
int r;
int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
+   int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 
0; // CF_INDEX_1 : CF_INDEX_NONE
 
-   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-   alu.op = ALU_OP1_MOV;
-   alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
-   if (ctx->bc->chip_class >= EVERGREEN) {
-   /* with eg each dword is either buf size or number of cubes */
-   alu.src[0].sel += id / 4;
-   alu.src[0].chan = id % 4;
-   } else {
+   if (ctx->bc->chip_class < EVERGREEN) {
+   struct r600_bytecode_alu alu;
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP1_MOV;
+   alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
/* r600 we have them at channel 2 of the second dword */
alu.src[0].sel += (id * 2) + 1;
alu.src[0].chan = 1;
+   alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
+   tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+   alu.last = 1;
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+   return 0;
+   } else {
+   struct r600_bytecode_vtx vtx;
+   memset(&vtx, 0, sizeof(vtx));
+   vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
+   vtx.buffer_id = id + R600_MAX_CONST_

[Mesa-dev] [PATCH 2/6] r600: don't use vtx offset for load_sample_position

2018-01-02 Thread sroland
From: Roland Scheidegger 

The offset looks bogus to me. Albeit in the end it doesn't matter, by the
looks of it offsets smaller than 4 get ignored there (not sure of the rules,
I suppose either non-dword aligned offsets never work there or the offset
must be at least aligned to the size of a single element).
---
 src/gallium/drivers/r600/r600_shader.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index e28882b2e5..792da950b3 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1284,7 +1284,7 @@ static int load_sample_position(struct r600_shader_ctx 
*ctx, struct r600_shader_
vtx.num_format_all = 2;
vtx.format_comp_all = 1;
vtx.use_const_fields = 0;
-   vtx.offset = 1; // first element is size of buffer
+   vtx.offset = 0;
vtx.endian = r600_endian_swap(32);
vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
 
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/6] r600: fix sampler indexing with texture buffers sampling

2018-01-02 Thread sroland
From: Roland Scheidegger 

This fixes the new piglit test.
(I could not actually figure out where the hell that index_1 parameter comes
from but in any case it's completely the same as for ordinary texturing...)
While here also fix up the logic for early exit of setting up driver consts.
---
 src/gallium/drivers/r600/r600_shader.c   | 2 ++
 src/gallium/drivers/r600/r600_state_common.c | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 792da950b3..8a36bcf1b4 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -6856,6 +6856,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, 
boolean src_requires_l
struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
int src_gpr, r, i;
int id = tgsi_tex_get_src_gpr(ctx, 1);
+   int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // 
CF_INDEX_1 : CF_INDEX_NONE
 
src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
if (src_requires_loading) {
@@ -6887,6 +6888,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, 
boolean src_requires_l
vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;  
/* SEL_Z */
vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;  
/* SEL_W */
vtx.use_const_fields = 1;
+   vtx.buffer_index_mode = sampler_index_mode;
 
if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
return r;
diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index e9dd80fa96..4429246d31 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1380,8 +1380,8 @@ void eg_setup_buffer_constants(struct r600_context *rctx, 
int shader_type)
}
 
if (!samplers->views.dirty_buffer_constants &&
-   (images && !images->dirty_buffer_constants) &&
-   (buffers && !buffers->dirty_buffer_constants))
+   !(images && images->dirty_buffer_constants) &&
+   !(buffers && buffers->dirty_buffer_constants))
return;
 
if (images)
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/6] r600: increase number of ubos by one to 14

2018-01-02 Thread sroland
From: Roland Scheidegger 

Ideally we'd support 16 (d3d11 requires 15, and mesa subtracts one for non-ubo
constants), but that's kind of impossible (it would be only doable if either
we'd somehow merge the mesa non-ubo constants with the driver constants, or
only use the driver constants with vtx fetch instead of through the kcache
mechanism - the latter probably wouldn't be too bad).
For now just do as the comment already said, place the gs ring (not really
a const buffer in any case) which is only ever referred to through vc fetch
clauses at index 16. Throw in a couple asserts for good measure to make sure
the hw limit isn't exceeded.
---
 src/gallium/drivers/r600/evergreen_state.c |  1 +
 src/gallium/drivers/r600/r600_asm.c|  1 +
 src/gallium/drivers/r600/r600_pipe.h   | 10 ++
 src/gallium/drivers/r600/r600_state.c  |  1 +
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 81b7c4a285..f5b8e7115d 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2168,6 +2168,7 @@ static void evergreen_emit_constant_buffers(struct 
r600_context *rctx,
va = rbuffer->gpu_address + cb->buffer_offset;
 
if (!gs_ring_buffer) {
+   assert(buffer_index < R600_MAX_HW_CONST_BUFFERS);
radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + 
buffer_index * 4,

DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
radeon_set_context_reg_flag(cs, reg_alu_const_cache + 
buffer_index * 4, va >> 8,
diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c
index 69b2d142c1..d6bd561f01 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -1008,6 +1008,7 @@ static int r600_bytecode_alloc_inst_kcache_lines(struct 
r600_bytecode *bc,
continue;
 
bank = alu->src[i].kc_bank;
+   assert(bank < R600_MAX_HW_CONST_BUFFERS);
line = (sel-512)>>4;
index_mode = alu->src[i].kc_rel ? 1 : 0; // V_SQ_CF_INDEX_0 / 
V_SQ_CF_INDEX_NONE
 
diff --git a/src/gallium/drivers/r600/r600_pipe.h 
b/src/gallium/drivers/r600/r600_pipe.h
index e042edf2b4..cb84bc1998 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -69,11 +69,12 @@
 #define R600_MAX_DRAW_CS_DWORDS58
 #define R600_MAX_PFP_SYNC_ME_DWORDS16
 
-#define R600_MAX_USER_CONST_BUFFERS 13
+#define EG_MAX_ATOMIC_BUFFERS 8
+
+#define R600_MAX_USER_CONST_BUFFERS 14
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
 #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + 
R600_MAX_DRIVER_CONST_BUFFERS)
-
-#define EG_MAX_ATOMIC_BUFFERS 8
+#define R600_MAX_HW_CONST_BUFFERS 16
 
 /* start driver buffers after user buffers */
 #define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS)
@@ -84,7 +85,8 @@
 #define R600_LDS_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS + 1)
 /*
  * Note GS doesn't use a constant buffer binding, just a resource index,
- * so it's fine to have it exist at index 16.
+ * so it's fine to have it exist at index 16. I.e. it's not actually
+ * a const buffer, just a buffer resource.
  */
 #define R600_GS_RING_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS + 2)
 /* Currently R600_MAX_CONST_BUFFERS just fits on the hw, which has a limit
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c
index 253ff57a98..89cf7d2e50 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1712,6 +1712,7 @@ static void r600_emit_constant_buffers(struct 
r600_context *rctx,
offset = cb->buffer_offset;
 
if (!gs_ring_buffer) {
+   assert(buffer_index < R600_MAX_HW_CONST_BUFFERS);
radeon_set_context_reg(cs, reg_alu_constbuf_size + 
buffer_index * 4,
   DIV_ROUND_UP(cb->buffer_size, 
256));
radeon_set_context_reg(cs, reg_alu_const_cache + 
buffer_index * 4, offset >> 8);
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/6] r600: increase number of UBOs to 15

2018-01-02 Thread sroland
From: Roland Scheidegger 

With the exception of the default tess levels only ever accessed
by the default tcs shader, the LDS_INFO const buffer was only accessed by vtx
instructions, and not through kcache. No idea why really, but use this to our
advantage by not using a constant buffer slot for it. This just requires us to
throw the default tess levels into the "normal" driver const buffer instead.
Alternatively, could access those constants via vtx instructions too, but then
we couldn't use an ordinary ureg prog accessing them as constants and would have
to generate that directly when compiling the default tcs shader. (Another
alternative would be to put all lds info into the ordinary driver const
buffer, albeit we'd maybe need to increase the fixed size as it can't fit
alongside the ucp since vs needs access to the lds info too.)
---
 src/gallium/drivers/r600/evergreen_state.c   | 15 --
 src/gallium/drivers/r600/r600_pipe.h | 13 
 src/gallium/drivers/r600/r600_state_common.c | 31 +---
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index f645791a2c..4cc48dfa11 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2168,8 +2168,7 @@ static void evergreen_emit_constant_buffers(struct 
r600_context *rctx,
 
va = rbuffer->gpu_address + cb->buffer_offset;
 
-   if (!gs_ring_buffer) {
-   assert(buffer_index < R600_MAX_HW_CONST_BUFFERS);
+   if (buffer_index < R600_MAX_HW_CONST_BUFFERS) {
radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + 
buffer_index * 4,

DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
radeon_set_context_reg_flag(cs, reg_alu_const_cache + 
buffer_index * 4, va >> 8,
@@ -3880,7 +3879,7 @@ static void evergreen_set_tess_state(struct pipe_context 
*ctx,
 
memcpy(rctx->tess_state, default_outer_level, sizeof(float) * 4);
memcpy(rctx->tess_state+4, default_inner_level, sizeof(float) * 2);
-   rctx->tess_state_dirty = true;
+   rctx->driver_consts[PIPE_SHADER_TESS_CTRL].tcs_default_levels_dirty = 
true;
 }
 
 static void evergreen_setup_immed_buffer(struct r600_context *rctx,
@@ -4344,7 +4343,7 @@ void evergreen_setup_tess_constants(struct r600_context 
*rctx, const struct pipe
unsigned input_vertex_size, output_vertex_size;
unsigned input_patch_size, pervertex_output_patch_size, 
output_patch_size;
unsigned output_patch0_offset, perpatch_output_offset, lds_size;
-   uint32_t values[16];
+   uint32_t values[8];
unsigned num_waves;
unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
unsigned wave_divisor = (16 * num_pipes);
@@ -4364,7 +4363,6 @@ void evergreen_setup_tess_constants(struct r600_context 
*rctx, const struct pipe
 
if (rctx->lds_alloc != 0 &&
rctx->last_ls == ls &&
-   !rctx->tess_state_dirty &&
rctx->last_num_tcs_input_cp == num_tcs_input_cp &&
rctx->last_tcs == tcs)
return;
@@ -4411,17 +4409,12 @@ void evergreen_setup_tess_constants(struct r600_context 
*rctx, const struct pipe
 
rctx->lds_alloc = (lds_size | (num_waves << 14));
 
-   memcpy(&values[8], rctx->tess_state, 6 * sizeof(float));
-   values[14] = 0;
-   values[15] = 0;
-
-   rctx->tess_state_dirty = false;
rctx->last_ls = ls;
rctx->last_tcs = tcs;
rctx->last_num_tcs_input_cp = num_tcs_input_cp;
 
constbuf.user_buffer = values;
-   constbuf.buffer_size = 16 * 4;
+   constbuf.buffer_size = 8 * 4;
 
rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_VERTEX,
  R600_LDS_INFO_CONST_BUFFER, &constbuf);
diff --git a/src/gallium/drivers/r600/r600_pipe.h 
b/src/gallium/drivers/r600/r600_pipe.h
index cb84bc1998..112b5cbb83 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -71,7 +71,7 @@
 
 #define EG_MAX_ATOMIC_BUFFERS 8
 
-#define R600_MAX_USER_CONST_BUFFERS 14
+#define R600_MAX_USER_CONST_BUFFERS 15
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
 #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + 
R600_MAX_DRIVER_CONST_BUFFERS)
 #define R600_MAX_HW_CONST_BUFFERS 16
@@ -80,12 +80,17 @@
 #define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS)
 #define R600_UCP_SIZE (4*4*8)
 #define R600_CS_BLOCK_GRID_SIZE (8 * 4)
+#define R600_TCS_DEFAULT_LEVELS_SIZE (6 * 4)
 #define R600_BUFFER_INFO_OFFSET (R600_UCP_SIZE)
 
+/*
+ * We only access this buffer through vtx clauses hence it's fine to exist
+ * at index beyond 15.
+ */
 #define R600_LDS_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS + 1)
 /*
  * Note GS doesn't use a constant buffer b

[Mesa-dev] [PATCH 6/6] r600: don't emit tes samplers/views when tes isn't active

2018-01-02 Thread sroland
From: Roland Scheidegger 

Similar to const buffers. The driver must not emit any tes-related state if tes
is disabled, since the hw slots are all shared by VS, therefore it would
overwrite them (the mesa state tracker might not do this, but it would be
perfectly legal to do so).
Nevertheless I think the dirty state tracking logic in the driver is
fundamentally flawed when tes is disabled/enabled, since it looks to me like
the VS (and TES) state would not get reemitted to the correct slots (if it's
not dirty anyway). Unless I'm missing something...
Theoretically, the overwrite problem could be solved by using non-overlapping
resource slots for TES and VS (since we're not even close to using half the
resource slots), but it wouldn't work for constant buffers nor samplers, and
for VS would still need to propagate changes to both LS and VS, so probably
not a useful idea.
Unfortunately there's zero coverage of this with piglit, since all tessellation
shader tests are just shader_runner tests, which are unsuitable for testing
any kind of state dependency tracking issues (so I can't even quickly hack
something up to prove it and fix it...).
TCS otoh is just fine - like GS it has its own hw slots.
---
 src/gallium/drivers/r600/evergreen_state.c   |  4 
 src/gallium/drivers/r600/r600_state_common.c | 15 +++
 2 files changed, 19 insertions(+)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 4cc48dfa11..fb1de9cbf4 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2334,6 +2334,8 @@ static void evergreen_emit_tcs_sampler_views(struct 
r600_context *rctx, struct r
 
 static void evergreen_emit_tes_sampler_views(struct r600_context *rctx, struct 
r600_atom *atom)
 {
+   if (!rctx->tes_shader)
+   return;
evergreen_emit_sampler_views(rctx, 
&rctx->samplers[PIPE_SHADER_TESS_EVAL].views,
 EG_FETCH_CONSTANTS_OFFSET_VS + 
R600_MAX_CONST_BUFFERS, 0);
 }
@@ -2404,6 +2406,8 @@ static void evergreen_emit_tcs_sampler_states(struct 
r600_context *rctx, struct
 
 static void evergreen_emit_tes_sampler_states(struct r600_context *rctx, 
struct r600_atom *atom)
 {
+   if (!rctx->tes_shader)
+   return;
evergreen_emit_sampler_states(rctx, 
&rctx->samplers[PIPE_SHADER_TESS_EVAL], 18,
  R_00A414_TD_VS_SAMPLER0_BORDER_INDEX, 0);
 }
diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index 4364350487..a434156c16 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1723,6 +1723,21 @@ static bool r600_update_derived_state(struct 
r600_context *rctx)
UPDATE_SHADER_CLIP(R600_HW_STAGE_VS, vs);
}
}
+   
+   /*
+* XXX: I believe there's some fatal flaw in the dirty state logic when
+* enabling/disabling tes.
+* VS/ES share all buffer/resource/sampler slots. If TES is enabled,
+* it will therefore overwrite the VS slots. If it now gets disabled,
+* the VS needs to rebind all buffer/resource/sampler slots - not only
+* has TES overwritten the corresponding slots, but when the VS was
+* operating as LS the things with correpsonding dirty bits got bound
+* to LS slots and won't reflect what is dirty as VS stage even if the
+* TES didn't overwrite it. The story for re-enabled TES is similar.
+* In any case, we're not allowed to submit any TES state when
+* TES is disabled (the state tracker may not do this but this looks
+* like an optimization to me, not something which can be relied on).
+*/
 
/* Update clip misc state. */
if (clip_so_current) {
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] r600: set up constants needed for txq for buffers and cube maps with tes

2017-12-31 Thread sroland
From: Roland Scheidegger 

We only did this for the other stages, but obviously tess eval/ctrl need it
too.
This fixes the (newly modified) piglit texturing/textureSize test when run
with tes stage and bufferSampler.
---
 src/gallium/drivers/r600/r600_state_common.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index e7fa1bbf57..e9dd80fa96 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1812,6 +1812,22 @@ static bool r600_update_derived_state(struct 
r600_context *rctx)
}
}
 
+   if (rctx->tes_shader) {
+   assert(rctx->b.chip_class >= EVERGREEN);
+   need_buf_const = 
rctx->tes_shader->current->shader.uses_tex_buffers ||
+
rctx->tes_shader->current->shader.has_txq_cube_array_z_comp;
+   if (need_buf_const) {
+   eg_setup_buffer_constants(rctx, PIPE_SHADER_TESS_EVAL);
+   }
+   if (rctx->tcs_shader) {
+   need_buf_const = 
rctx->tcs_shader->current->shader.uses_tex_buffers ||
+
rctx->tcs_shader->current->shader.has_txq_cube_array_z_comp;
+   if (need_buf_const) {
+   eg_setup_buffer_constants(rctx, 
PIPE_SHADER_TESS_CTRL);
+   }
+   }
+   }
+
r600_update_driver_const_buffers(rctx, false);
 
if (rctx->b.chip_class < EVERGREEN && rctx->ps_shader && 
rctx->vs_shader) {
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] r600: support 32 vertex attribs for evergreen

2017-12-31 Thread sroland
From: Roland Scheidegger 

Evergreen clearly has 32 slots, so it should just work (and the affected array
is already sized with PIPE_MAX_ATTRIB).
Note: As dx10.1 chips, r600/r700 should support this too, but seemingly there's
only 16 resource slots for fetch shaders (fs). However, a quick look seems to
suggest the fs slots are actually shared with vs and not separate (as the fetch
shader uses an offset of 160 on these chips), therefore (we're not even close
to using all vs slots) just using different offsets might work, but I cannot
verify this.

No piglit change.
---
 src/gallium/drivers/r600/r600_pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index 2583c719a3..c294973e8b 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -559,7 +559,7 @@ static int r600_get_shader_param(struct pipe_screen* 
pscreen,
case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
return 32;
case PIPE_SHADER_CAP_MAX_INPUTS:
-   return shader == PIPE_SHADER_VERTEX ? 16 : 32;
+   return shader == PIPE_SHADER_VERTEX ? (rscreen->b.family >= 
CHIP_CEDAR ? 32 : 16) : 32;
case PIPE_SHADER_CAP_MAX_OUTPUTS:
return shader == PIPE_SHADER_FRAGMENT ? 8 : 32;
case PIPE_SHADER_CAP_MAX_TEMPS:
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] r600: don't emit reloc for ring buffer out into the blue

2017-12-31 Thread sroland
From: Roland Scheidegger 

It looks like this reloc belongs to setting the constant reg, which is skipped
for gs ring.
---
 src/gallium/drivers/r600/evergreen_state.c | 7 +++
 src/gallium/drivers/r600/r600_state.c  | 7 +++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 0da665f634..81b7c4a285 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2172,12 +2172,11 @@ static void evergreen_emit_constant_buffers(struct 
r600_context *rctx,

DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags);
radeon_set_context_reg_flag(cs, reg_alu_const_cache + 
buffer_index * 4, va >> 8,
pkt_flags);
+   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
+   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, 
&rctx->b.gfx, rbuffer,
+ 
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
}
 
-   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, 
&rctx->b.gfx, rbuffer,
- RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER));
-
radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags);
radeon_emit(cs, (buffer_id_base + buffer_index) * 8);
radeon_emit(cs, va); /* RESOURCEi_WORD0 */
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c
index cbf860f45f..253ff57a98 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1715,12 +1715,11 @@ static void r600_emit_constant_buffers(struct 
r600_context *rctx,
radeon_set_context_reg(cs, reg_alu_constbuf_size + 
buffer_index * 4,
   DIV_ROUND_UP(cb->buffer_size, 
256));
radeon_set_context_reg(cs, reg_alu_const_cache + 
buffer_index * 4, offset >> 8);
+   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, 
&rctx->b.gfx, rbuffer,
+ 
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
}
 
-   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, 
&rctx->b.gfx, rbuffer,
- RADEON_USAGE_READ, 
RADEON_PRIO_CONST_BUFFER));
-
radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0));
radeon_emit(cs, (buffer_id_base + buffer_index) * 7);
radeon_emit(cs, offset); /* RESOURCEi_WORD0 */
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] r600: kill off native_integer shader ctx flag

2017-12-22 Thread sroland
From: Roland Scheidegger 

Maybe once upon a time it wasn't always true.
---
 src/gallium/drivers/r600/r600_shader.c | 18 --
 1 file changed, 18 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 06d7ca02e9..6cdbfd3063 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -350,7 +350,6 @@ struct r600_shader_ctx {
int cs_grid_size_reg;
bool cs_block_size_loaded, cs_grid_size_loaded;
int fragcoord_input;
-   int native_integers;
int next_ring_offset;
int gs_out_ring_offset;
int gs_next_vertex;
@@ -998,22 +997,6 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
break; /* Already handled from 
allocate_system_value_inputs */
} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
-   if (!ctx->native_integers) {
-   struct r600_bytecode_alu alu;
-   memset(&alu, 0, sizeof(struct 
r600_bytecode_alu));
-
-   alu.op = ALU_OP1_INT_TO_FLT;
-   alu.src[0].sel = 0;
-   alu.src[0].chan = 3;
-
-   alu.dst.sel = 0;
-   alu.dst.chan = 3;
-   alu.dst.write = 1;
-   alu.last = 1;
-
-   if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
-   return r;
-   }
break;
} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
break;
@@ -3128,7 +3111,6 @@ static int r600_shader_from_tgsi(struct r600_context 
*rctx,
 
ctx.bc = &shader->bc;
ctx.shader = shader;
-   ctx.native_integers = true;
 
r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
   rscreen->has_compressed_msaa_texturing);
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] r600: fix textureSize queries with tbos

2017-12-22 Thread sroland
From: Roland Scheidegger 

piglit doesn't care, but I'm quite confident that the size actually bound
as range should be reported and not the base size of the resource.
Also, the array in the constant buffer looks overallocated by a factor of 4.
For eg, also decrease the size by another factor of 2 by using the same
constant slot for both buffer size (required for txq for TBOs) and the number
of layers for cube arrays, as these are mutually exclusive. Could of course use
some more logic and only actually do this for the samplers/images/buffers where
it's required rather than for all, but ah well...
(FWIW I believe the txq for TBOs would be fixable on EG without using a
constant buffer by using the GET_BUFFER_RESINFO vc fetch, but for cube map
arrays we'd still need the buffer as it's unfixable since the hw requires
always 0 unfortunately.)
---
 src/gallium/drivers/r600/r600_shader.c   | 18 +++---
 src/gallium/drivers/r600/r600_state_common.c | 35 +---
 2 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 6cdbfd3063..8a63621c2f 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -6955,9 +6955,9 @@ static int r600_do_buffer_txq(struct r600_shader_ctx 
*ctx, int reg_idx, int offs
alu.op = ALU_OP1_MOV;
alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
if (ctx->bc->chip_class >= EVERGREEN) {
-   /* channel 0 or 2 of each word */
-   alu.src[0].sel += (id / 2);
-   alu.src[0].chan = (id % 2) * 2;
+   /* with eg each dword is either buf size or number of cubes */
+   alu.src[0].sel += id / 4;
+   alu.src[0].chan = id % 4;
} else {
/* r600 we have them at channel 2 of the second dword */
alu.src[0].sel += (id * 2) + 1;
@@ -7615,9 +7615,9 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 
alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
if (ctx->bc->chip_class >= EVERGREEN) {
-   /* channel 1 or 3 of each word */
-   alu.src[0].sel += (id / 2);
-   alu.src[0].chan = ((id % 2) * 2) + 1;
+   /* with eg each dword is either buf size or number of 
cubes */
+   alu.src[0].sel += id / 4;
+   alu.src[0].chan = id % 4;
} else {
/* r600 we have them at channel 2 of the second dword */
alu.src[0].sel += (id * 2) + 1;
@@ -8782,9 +8782,9 @@ static int tgsi_resq(struct r600_shader_ctx *ctx)
alu.op = ALU_OP1_MOV;
 
alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
-   /* channel 1 or 3 of each word */
-   alu.src[0].sel += (id / 2);
-   alu.src[0].chan = ((id % 2) * 2) + 1;
+   /* with eg each dword is either buf size or number of cubes */
+   alu.src[0].sel += id / 4;
+   alu.src[0].chan = id % 4;
alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
alu.last = 1;
diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index e5a5a33367..e9996cb3fa 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -902,7 +902,6 @@ struct r600_pipe_shader_selector 
*r600_create_shader_state_tokens(struct pipe_co
  unsigned 
pipe_shader_type)
 {
struct r600_pipe_shader_selector *sel = 
CALLOC_STRUCT(r600_pipe_shader_selector);
-   int i;
 
sel->type = pipe_shader_type;
sel->tokens = tgsi_dup_tokens(tokens);
@@ -1326,7 +1325,7 @@ static void r600_setup_buffer_constants(struct 
r600_context *rctx, int shader_ty
samplers->views.dirty_buffer_constants = FALSE;
 
bits = util_last_bit(samplers->views.enabled_mask);
-   array_size = bits * 8 * sizeof(uint32_t) * 4;
+   array_size = bits * 8 * sizeof(uint32_t);
 
constants = r600_alloc_buf_consts(rctx, shader_type, array_size, 
&base_offset);
 
@@ -1349,7 +1348,8 @@ static void r600_setup_buffer_constants(struct 
r600_context *rctx, int shader_ty
} else
constants[offset + 4] = 0;
 
-   constants[offset + 5] = 
samplers->views.views[i]->base.texture->width0 / 
util_format_get_blocksize(samplers->views.views[i]->base.format);
+   constants[offset + 5] = 
samplers->views.views[i]->base.u.buf.size /
+   
util_format_get_blocksize(samplers->views.views[i]->base.format);
constants[offset + 6] = 
samplers->views.views[

[Mesa-dev] [PATCH 1/2] gallivm: implement accurate corner behavior for textureGather with cube maps

2017-12-12 Thread sroland
From: Roland Scheidegger 

The spec says the missing texel (when we wrap around both x and y axis)
should be synthesized as the average of the 3 other texels. For bilinear
filtering however we instead adjusted the filter weights (because, while
the complexity looks similar, there would be 4 times as many color values
to fix up than weights). Obviously this could not work for gather (hence
accurate corner filtering was disabled with gather).
Implement this by just doing it as the spec implies - calculate the 4th
texel as the average of the other 3. With gather of course there's only
one color to worry about, so it's not all that many instructions either
(albeit surely the whole cube map filtering is hilariously complex).
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 304 ++
 1 file changed, 201 insertions(+), 103 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index def731e..571a968 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1030,20 +1030,13 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
LLVMValueRef neighbors[2][2][4];
int chan, texel_index;
boolean seamless_cube_filter, accurate_cube_corners;
+   unsigned chan_swiz = bld->static_texture_state->swizzle_r;
 
seamless_cube_filter = (bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE ||
bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE_ARRAY) &&
   bld->static_sampler_state->seamless_cube_map;
-   /*
-* XXX I don't know how this is really supposed to work with gather. From GL
-* spec wording (not gather specific) it sounds like the 4th missing texel
-* should be an average of the other 3, hence for gather could return this.
-* This is however NOT how the code here works, which just fixes up the
-* weights used for filtering instead. And of course for gather there is
-* no filter to tweak...
-*/
-   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
-   !is_gather;
+
+   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
 
lp_build_extract_image_sizes(bld,
 &bld->int_size_bld,
@@ -1371,94 +1364,191 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
* as well) here.
*/
   if (accurate_cube_corners) {
- LLVMValueRef w00, w01, w10, w11, wx0, wy0;
- LLVMValueRef c_weight, c00, c01, c10, c11;
- LLVMValueRef have_corner, one_third, tmp;
+ LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
+ LLVMValueRef have_corner, one_third;
 
- colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
- colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
- colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
- colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+ colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, 
"cs0");
+ colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, 
"cs1");
+ colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, 
"cs2");
+ colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, 
"cs3");
 
  have_corner = LLVMBuildLoad(builder, have_corners, "");
 
  lp_build_if(&corner_if, bld->gallivm, have_corner);
 
- /*
-  * we can't use standard 2d lerp as we need per-element weight
-  * in case of corners, so just calculate bilinear result as
-  * w00*s00 + w01*s01 + w10*s10 + w11*s11.
-  * (This is actually less work than using 2d lerp, 7 vs. 9 
instructions,
-  * however calculating the weights needs another 6, so actually 
probably
-  * not slower than 2d lerp only for 4 channels as weights only need
-  * to be calculated once - of course fixing the weights has 
additional cost.)
-  */
- wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
- wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
- w00 = lp_build_mul(coord_bld, wx0, wy0);
- w01 = lp_build_mul(coord_bld, s_fpart, wy0);
- w10 = lp_build_mul(coord_bld, wx0, t_fpart);
- w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
-
- /* find corner weight */
+ one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
+1.0f/3.0f);
+
+ /* find corner */
  c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
- c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
+ c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
  c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
- 

[Mesa-dev] [PATCH 2/2] gallivm: fix an issue with NaNs with seamless cube filtering

2017-12-12 Thread sroland
From: Roland Scheidegger 

Cube texture wrapping is a bit special since the values (post face
projection) always are within [0,1], so we took advantage of that and
omitted some clamps.
However, we can still get NaNs (either because the coords already had NaNs,
or the face projection generated them), and in fact we didn't handle them
quite safely. I've seen -INT_MAX + 1 being propagated through as the final int
coord value, albeit I didn't observe a crash. (Not quite a coincidence, since
any stride mul with -INT_MAX or -INT_MAX+1 will turn up as a small positive
number - nevertheless, I'd rather not try my luck, I'm not entirely sure it
can't really turn up negative either due to seamless coord swapping, plus
ifloor of a NaN is not guaranteed to return -INT_MAX by any standard. And
we kill off NaNs similarly with ordinary texture wrapping too.)
So kill off the NaNs by using the common max against zero method.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 571a968..ff8cbf6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1123,6 +1123,17 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
*/
   /* should always have normalized coords, and offsets are undefined */
   assert(bld->static_sampler_state->normalized_coords);
+  /*
+   * The coords should all be between [0,1] however we can have NaNs,
+   * which will wreak havoc. In particular the y1_clamped value below
+   * can be -INT_MAX (on x86) and be propagated right through (probably
+   * other values might be bogus in the end too).
+   * So kill off the NaNs here.
+   */
+  coords[0] = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
+   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+  coords[1] = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
+   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
   /* instead of clamp, build mask if overflowed */
   coord = lp_build_sub(coord_bld, coord, half);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix texture wrapping for texture gather for mirror modes

2017-12-09 Thread sroland
From: Roland Scheidegger 

Care must be taken that all coords end up correct, the tests are very
sensitive that everything is correctly rounded. This doesn't matter
for bilinear filter (since picking a wrong texel with weight zero is
ok), and we could also switch the per-sample coords mistakenly.
While here, also optimize the coord_mirror helper a bit (we can do the
mirroring directly by exploiting float rounding, no need for fixing up
odd/even manually).
I did not touch the mirror_clamp and mirror_clamp_to_border modes.
In contrast to mirror_clamp_to_edge and mirror_repeat these are legacy
modes. They are specified against old gl rules, which actually do
the mirroring not per sample (so you get swapped order if the coord
is in the mirrored section). I think the idea though is that they should
follow the respecified mirror_clamp_to_edge rules so the order would be
correct.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 242 +++---
 1 file changed, 169 insertions(+), 73 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index b67a089..3605c77 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -218,34 +218,42 @@ lp_build_sample_texel_soa(struct lp_build_sample_context 
*bld,
 
 
 /**
- * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
+ * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
+ * (Note that with pot sizes could do this much more easily post-scale
+ * with some bit arithmetic.)
  */
 static LLVMValueRef
 lp_build_coord_mirror(struct lp_build_sample_context *bld,
-  LLVMValueRef coord)
+  LLVMValueRef coord, boolean posOnly)
 {
struct lp_build_context *coord_bld = &bld->coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef fract, flr, isOdd;
-
-   lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
-   /* kill off NaNs */
-   /* XXX: not safe without arch rounding, fract can be anything. */
-   fract = lp_build_max_ext(coord_bld, fract, coord_bld->zero,
-GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
-
-   /* isOdd = flr & 1 */
-   isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, "");
+   LLVMValueRef fract;
+   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
 
-   /* make coord positive or negative depending on isOdd */
-   /* XXX slight overkill masking out sign bit is unnecessary */
-   coord = lp_build_set_sign(coord_bld, fract, isOdd);
+   /*
+* We can just use 2*(x - round(0.5*x)) to do all the mirroring,
+* it all works out. (The result is in range [-1, 1.0], negative if
+* the coord is in the "odd" section, otherwise positive.)
+*/
 
-   /* convert isOdd to float */
-   isOdd = lp_build_int_to_float(coord_bld, isOdd);
+   coord = lp_build_mul(coord_bld, coord, half);
+   fract = lp_build_round(coord_bld, coord);
+   fract = lp_build_sub(coord_bld, coord, fract);
+   coord = lp_build_add(coord_bld, fract, fract);
 
-   /* add isOdd to coord */
-   coord = lp_build_add(coord_bld, coord, isOdd);
+   if (posOnly) {
+  /*
+   * Theoretically it's not quite 100% accurate because the spec says
+   * that ultimately a scaled coord of -x.0 should map to int coord
+   * -x + 1 with mirroring, not -x (this does not matter for bilinear
+   * filtering).
+   */
+  coord = lp_build_abs(coord_bld, coord);
+  /* kill off NaNs */
+  /* XXX: not safe without arch rounding, fract can be anything. */
+  coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
+   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
+   }
 
return coord;
 }
@@ -363,6 +371,11 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context 
*bld,
   }
 
   /* clamp to [0, length] */
+  /*
+   * Unlike some other wrap modes, this should be correct for gather
+   * too. GL_CLAMP explicitly does this clamp on the coord prior to
+   * actual wrapping (which is per sample).
+   */
   coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
 
   coord = lp_build_sub(coord_bld, coord, half);
@@ -426,8 +439,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context 
*bld,
  offset = lp_build_int_to_float(coord_bld, offset);
  coord = lp_build_add(coord_bld, coord, offset);
   }
-  /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */
-  /* can skip clamp (though might not work for very large coord values) */
+  /*
+   * We don't need any clamp. Technically, for very large (pos or neg)
+   * (or infinite) values, clamp against [-length, length] would be
+   * correct, but we don't need to guarantee any specific
+   * result for such coords (the ifloor will be undefined, but for modes
+  

[Mesa-dev] [PATCH] r600: set DX10_CLAMP for compute shader too

2017-11-21 Thread sroland
From: Roland Scheidegger 

I really intended to set this for all shader stages by
3835009796166968750ff46cf209f6d4208cda86 but missed it for compute shaders
(because it's in a different source file...).
---
 src/gallium/drivers/r600/evergreen_compute.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_compute.c 
b/src/gallium/drivers/r600/evergreen_compute.c
index 6e87539..48c4a9c 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -746,8 +746,9 @@ void evergreen_emit_cs_shader(struct r600_context *rctx,
radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
radeon_emit(cs,   /* R_0288D4_SQ_PGM_RESOURCES_LS */
-   S_0288D4_NUM_GPRS(ngpr)
-   | S_0288D4_STACK_SIZE(nstack));
+   S_0288D4_NUM_GPRS(ngpr) |
+   S_0288D4_DX10_CLAMP(1) |
+   S_0288D4_STACK_SIZE(nstack));
radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 
radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: fix snorm blending

2017-11-17 Thread sroland
From: Roland Scheidegger 

The blend math gets a bit funky due to inverse blend factors being
in range [0,2] rather than [-1,1], our normalized math can't really
cover this.
src_alpha_saturate blend factor has a similar problem too.
(Note that piglit fbo-blending-formats test is mostly useless for
anything but unorm formats, since not just all src/dst values are
between [0,1], but the tests are crafted in a way that the results
are between [0,1] too.)

v2: some formatting fixes, and fix a fairly obscure (to debug)
issue with alpha-only formats (not related to snorm at all), where
blend optimization would think it could simplify the blend equation
if the blend factors were complementary, however was using the
completely unrelated rgb blend factors instead of the alpha ones...
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c |  50 -
 src/gallium/auxiliary/gallivm/lp_bld_arit.h |   7 ++
 src/gallium/drivers/llvmpipe/lp_bld_blend.c | 130 ++--
 src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |  53 ++
 4 files changed, 187 insertions(+), 53 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index a1edd34..321c6e4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -541,38 +541,38 @@ lp_build_add(struct lp_build_context *bld,
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
 
-   if(a == bld->zero)
+   if (a == bld->zero)
   return b;
-   if(b == bld->zero)
+   if (b == bld->zero)
   return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
   return bld->undef;
 
-   if(bld->type.norm) {
+   if (type.norm) {
   const char *intrinsic = NULL;
 
-  if(a == bld->one || b == bld->one)
+  if (!type.sign && (a == bld->one || b == bld->one))
 return bld->one;
 
   if (!type.floating && !type.fixed) {
  if (type.width * type.length == 128) {
-if(util_cpu_caps.has_sse2) {
-  if(type.width == 8)
+if (util_cpu_caps.has_sse2) {
+  if (type.width == 8)
 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : 
"llvm.x86.sse2.paddus.b";
-  if(type.width == 16)
+  if (type.width == 16)
 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : 
"llvm.x86.sse2.paddus.w";
 } else if (util_cpu_caps.has_altivec) {
-  if(type.width == 8)
+  if (type.width == 8)
  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : 
"llvm.ppc.altivec.vaddubs";
-  if(type.width == 16)
+  if (type.width == 16)
  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : 
"llvm.ppc.altivec.vadduhs";
 }
  }
  if (type.width * type.length == 256) {
-if(util_cpu_caps.has_avx2) {
-  if(type.width == 8)
+if (util_cpu_caps.has_avx2) {
+  if (type.width == 8)
 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : 
"llvm.x86.avx2.paddus.b";
-  if(type.width == 16)
+  if (type.width == 16)
 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : 
"llvm.x86.avx2.paddus.w";
 }
  }
@@ -842,38 +842,38 @@ lp_build_sub(struct lp_build_context *bld,
assert(lp_check_value(type, a));
assert(lp_check_value(type, b));
 
-   if(b == bld->zero)
+   if (b == bld->zero)
   return a;
-   if(a == bld->undef || b == bld->undef)
+   if (a == bld->undef || b == bld->undef)
   return bld->undef;
-   if(a == b)
+   if (a == b)
   return bld->zero;
 
-   if(bld->type.norm) {
+   if (type.norm) {
   const char *intrinsic = NULL;
 
-  if(b == bld->one)
+  if (!type.sign && b == bld->one)
 return bld->zero;
 
   if (!type.floating && !type.fixed) {
  if (type.width * type.length == 128) {
 if (util_cpu_caps.has_sse2) {
-  if(type.width == 8)
+  if (type.width == 8)
  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : 
"llvm.x86.sse2.psubus.b";
-  if(type.width == 16)
+  if (type.width == 16)
  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : 
"llvm.x86.sse2.psubus.w";
 } else if (util_cpu_caps.has_altivec) {
-  if(type.width == 8)
+  if (type.width == 8)
  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : 
"llvm.ppc.altivec.vsububs";
-  if(type.width == 16)
+  if (type.width == 16)
  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : 
"llvm.ppc.altivec.vsubuhs";
 }
  }
  if (type.width * type.length == 256) {
 if (util_cpu_caps.has_avx2) {
-  if(type.width == 8)
+  if (type.width == 8)
   

[Mesa-dev] [PATCH] llvmpipe: fix snorm blending

2017-11-16 Thread sroland
From: Roland Scheidegger 

The blend math gets a bit funky due to inverse blend factors being
in range [0,2] rather than [-1,1], our normalized math can't really
cover this.
src_alpha_saturate blend factor has a similar problem too.
(Note that piglit fbo-blending-formats test is mostly useless for
anything but unorm formats, since not just all src/dst values are
between [0,1], but the tests are crafted in a way that the results
are between [0,1] too.)
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c |  10 +-
 src/gallium/auxiliary/gallivm/lp_bld_arit.h |   7 ++
 src/gallium/drivers/llvmpipe/lp_bld_blend.c | 120 +++-
 src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |  28 --
 4 files changed, 149 insertions(+), 16 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index a1edd34..628dedd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -548,10 +548,10 @@ lp_build_add(struct lp_build_context *bld,
if(a == bld->undef || b == bld->undef)
   return bld->undef;
 
-   if(bld->type.norm) {
+   if(type.norm) {
   const char *intrinsic = NULL;
 
-  if(a == bld->one || b == bld->one)
+  if(!type.sign && (a == bld->one || b == bld->one))
 return bld->one;
 
   if (!type.floating && !type.fixed) {
@@ -849,10 +849,10 @@ lp_build_sub(struct lp_build_context *bld,
if(a == b)
   return bld->zero;
 
-   if(bld->type.norm) {
+   if(type.norm) {
   const char *intrinsic = NULL;
 
-  if(b == bld->one)
+  if(!type.sign && b == bld->one)
 return bld->zero;
 
   if (!type.floating && !type.fixed) {
@@ -963,7 +963,7 @@ lp_build_sub(struct lp_build_context *bld,
  * @sa Michael Herf, The "double blend trick", May 2000, 
  * http://www.stereopsis.com/doubleblend.html
  */
-static LLVMValueRef
+LLVMValueRef
 lp_build_mul_norm(struct gallivm_state *gallivm,
   struct lp_type wide_type,
   LLVMValueRef a, LLVMValueRef b)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 2a4137a..f5b2800 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -71,6 +71,13 @@ lp_build_sub(struct lp_build_context *bld,
  LLVMValueRef a,
  LLVMValueRef b);
 
+
+LLVMValueRef
+lp_build_mul_norm(struct gallivm_state *gallivm,
+  struct lp_type wide_type,
+  LLVMValueRef a,
+  LLVMValueRef b);
+
 LLVMValueRef
 lp_build_mul(struct lp_build_context *bld,
  LLVMValueRef a,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c 
b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
index 1feb415..bd886dc 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c
@@ -35,6 +35,7 @@
 #include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_pack.h"
 
 #include "lp_bld_blend.h"
 
@@ -86,6 +87,56 @@ lp_build_blend_factor_complementary(unsigned src_factor, 
unsigned dst_factor)
 
 
 /**
+ * Whether this is a inverse blend factor
+ */
+static inline boolean
+is_inverse_factor(unsigned factor)
+{
+   return factor > 0x11;
+}
+
+
+/**
+ * Calculates the (expanded to wider type) multiplication
+ * of 2 normalized numbers.
+ */
+static void
+lp_build_mul_norm_expand(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b,
+ LLVMValueRef *resl, LLVMValueRef *resh,
+ boolean signedness_differs)
+{
+   const struct lp_type type = bld->type;
+   struct lp_type wide_type = lp_wider_type(type);
+   struct lp_type wide_type2 = wide_type;
+   struct lp_type type2 = type;
+   LLVMValueRef al, ah, bl, bh;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+   assert(!type.floating && !type.fixed && type.norm);
+
+   if(a == bld->zero || b == bld->zero) {
+  LLVMValueRef zero = LLVMConstNull(lp_build_vec_type(bld->gallivm, 
wide_type));
+  *resl = zero;
+  *resh = zero;
+  return;
+   }
+
+   if (signedness_differs) {
+  type2.sign = !type.sign;
+  wide_type2.sign = !wide_type2.sign;
+   }
+
+   lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+   lp_build_unpack2_native(bld->gallivm, type2, wide_type2, b, &bl, &bh);
+
+   *resl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
+   *resh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
+}
+
+
+/**
  * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
  */
 LLVMValueRef
@@ -192,9 +243,72 @@ lp_build_blend(struct lp_build_context *bld,
if (optimise_only)
   return NULL;
 
-   src_term = lp_build_mul(bld, src, src_factor);
-   dst_term = lp_build_mul(bld, dst, dst_factor);
-   return lp_bui

[Mesa-dev] [PATCH 3/5] r600: use ieee version of rcp

2017-11-09 Thread sroland
From: Roland Scheidegger 

r600 used the clamped version for rcp, whereas both evergreen and cayman
used the ieee version. I don't know why that discrepancy exists (it does so
since day 1) but there does not seem to be a valid reason for this, so make
it consistent. This seems now safer than before the previous commit (using
the dx10 clamp bit).
Note that rsq still uses clamped version (as before even though the table
may have suggested otherwise for evergreen) for r600/eg, but not for cayman.
Will be changed separately for better regression tracking...
---
 src/gallium/drivers/r600/r600_shader.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index e9054c4fbb..2ece2210a6 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -8830,11 +8830,7 @@ static const struct r600_shader_tgsi_instruction 
r600_shader_tgsi_instruction[]
[TGSI_OPCODE_MOV]   = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_LIT]   = { ALU_OP0_NOP, tgsi_lit},
 
-   /* XXX:
-* For state trackers other than OpenGL, we'll want to use
-* _RECIP_IEEE instead.
-*/
-   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_CLAMPED, 
tgsi_trans_srcx_replicate},
+   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_IEEE, 
tgsi_trans_srcx_replicate},
 
[TGSI_OPCODE_RSQ]   = { ALU_OP0_NOP, tgsi_rsq},
[TGSI_OPCODE_EXP]   = { ALU_OP0_NOP, tgsi_exp},
@@ -9035,7 +9031,7 @@ static const struct r600_shader_tgsi_instruction 
eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_MOV]   = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_LIT]   = { ALU_OP0_NOP, tgsi_lit},
[TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_IEEE, 
tgsi_trans_srcx_replicate},
-   [TGSI_OPCODE_RSQ]   = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
+   [TGSI_OPCODE_RSQ]   = { ALU_OP0_NOP, tgsi_rsq},
[TGSI_OPCODE_EXP]   = { ALU_OP0_NOP, tgsi_exp},
[TGSI_OPCODE_LOG]   = { ALU_OP0_NOP, tgsi_log},
[TGSI_OPCODE_MUL]   = { ALU_OP2_MUL_IEEE, tgsi_op2},
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/5] r600: use ieee version of rsq

2017-11-09 Thread sroland
From: Roland Scheidegger 

Both r600 and evergreen used the clamped version, whereas cayman used the
ieee one. I don't think there's a valid reason for this discrepancy, so let's
switch to the ieee version for r600 and evergreen too, since we generally
want to stick to ieee arithmetic.
With this, behavior for both rcp and rsq should now be the same for all of
r600, eg, cm, all using ieee versions (albeit note rsq retains the abs
behavior for everybody, which may not be a good idea ultimately).
---
 src/gallium/drivers/r600/r600_shader.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 2ece2210a6..3f42654d13 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -4796,11 +4796,7 @@ static int tgsi_rsq(struct r600_shader_ctx *ctx)
 
memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 
-   /* XXX:
-* For state trackers other than OpenGL, we'll want to use
-* _RECIPSQRT_IEEE instead.
-*/
-   alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
+   alu.op = ALU_OP1_RECIPSQRT_IEEE;
 
for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/5] r600: use min_dx10/max_dx10 instead of min/max

2017-11-09 Thread sroland
From: Roland Scheidegger 

I believe this is the safe thing to do, especially ever since the driver
actually generates NaNs for muls too.
The ISA docs are not very helpful here, however the dx10 versions will pick
a non-nan result over a NaN one (this is also the ieee754 behavior), whereas
the non-dx10 ones will pick the NaN (verified by newly changed piglit
isinf-and-isnan test).
Other "modern" drivers will most likely do the same.
This was shown to make some difference for bug 103544, albeit it is not
required to fix it.
---
 src/gallium/drivers/r600/r600_shader.c  | 13 +++--
 src/gallium/drivers/r600/sb/sb_expr.cpp |  2 ++
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 188fbc9d47..e9054c4fbb 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -8844,8 +8844,9 @@ static const struct r600_shader_tgsi_instruction 
r600_shader_tgsi_instruction[]
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   /* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
@@ -9042,8 +9043,8 @@ static const struct r600_shader_tgsi_instruction 
eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
@@ -9265,8 +9266,8 @@ static const struct r600_shader_tgsi_instruction 
cm_shader_tgsi_instruction[] =
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp 
b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 3dd3a4815b..7a5d62c8e8 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -753,7 +753,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) {
n.bc.src[0].abs == n.bc.src[1].abs) {
switch (n.bc.op) {
case ALU_OP2_MIN: // (MIN x, x) => (MOV x)
+   case ALU_OP2_MIN_DX10:
case ALU_OP2_MAX:
+   case ALU_OP2_MAX_DX10:
convert_to_mov(n, v0, n.bc.src[0].neg, 
n.bc.src[0].abs);
return fold_alu_op1(n);
case ALU_OP2_ADD:  // (ADD x, x) => (MUL x, 2)
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/5] r600: set the number type correctly for float rts in cb setup

2017-11-09 Thread sroland
From: Roland Scheidegger 

Float rts were always set as unorm instead of float.
Not sure of the consequences, but at least it looks like the blend clamp
would have been enabled, which is against the rules (only eg really bothered
to even attempt to specify this correctly, r600 always used clamp anyway).
Albeit r600 (not r700) setup still looks bugged to me due to never setting
BLEND_FLOAT32 which must be set according to docs...
Not sure if the hw really cares, no piglit change (on eg/juniper).
---
 src/gallium/drivers/r600/evergreen_state.c |  7 ++-
 src/gallium/drivers/r600/r600_state.c  | 10 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index ef323bf4f6..e724cb157f 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1042,7 +1042,7 @@ static void evergreen_set_color_surface_buffer(struct 
r600_context *rctx,
}
}
ntype = V_028C70_NUMBER_UNORM;
-   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
ntype = V_028C70_NUMBER_SRGB;
else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
if (desc->channel[i].normalized)
@@ -1054,7 +1054,10 @@ static void evergreen_set_color_surface_buffer(struct 
r600_context *rctx,
ntype = V_028C70_NUMBER_UNORM;
else if (desc->channel[i].pure_integer)
ntype = V_028C70_NUMBER_UINT;
+   } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+   ntype = V_028C70_NUMBER_FLOAT;
}
+
pitch = (pitch / 8) - 1;
color->pitch = S_028C64_PITCH_TILE_MAX(pitch);
 
@@ -1180,6 +1183,8 @@ static void evergreen_set_color_surface_common(struct 
r600_context *rctx,
ntype = V_028C70_NUMBER_UNORM;
else if (desc->channel[i].pure_integer)
ntype = V_028C70_NUMBER_UINT;
+   } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+   ntype = V_028C70_NUMBER_FLOAT;
}
 
if (R600_BIG_ENDIAN)
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c
index db3d6db70b..f024987a30 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -817,7 +817,7 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
unsigned offset;
const struct util_format_description *desc;
int i;
-   bool blend_bypass = 0, blend_clamp = 1, do_endian_swap = FALSE;
+   bool blend_bypass = 0, blend_clamp = 0, do_endian_swap = FALSE;
 
if (rtex->db_compatible && !r600_can_sample_zs(rtex, false)) {
r600_init_flushed_depth_texture(&rctx->b.b, surf->base.texture, 
NULL);
@@ -869,6 +869,8 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
ntype = V_0280A0_NUMBER_UNORM;
else if (desc->channel[i].pure_integer)
ntype = V_0280A0_NUMBER_UINT;
+   } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+   ntype = V_0280A0_NUMBER_FLOAT;
}
 
if (R600_BIG_ENDIAN)
@@ -883,6 +885,11 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
 
endian = r600_colorformat_endian_swap(format, do_endian_swap);
 
+   /* blend clamp should be set for all NORM/SRGB types */
+   if (ntype == V_0280A0_NUMBER_UNORM || ntype == V_0280A0_NUMBER_SNORM ||
+   ntype == V_0280A0_NUMBER_SRGB)
+   blend_clamp = 1;
+
/* set blend bypass according to docs if SINT/UINT or
   8/24 COLOR variants */
if (ntype == V_0280A0_NUMBER_UINT || ntype == V_0280A0_NUMBER_SINT ||
@@ -916,6 +923,7 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
 ntype != V_0280A0_NUMBER_UINT &&
 ntype != V_0280A0_NUMBER_SINT) &&
G_0280A0_BLEND_CLAMP(color_info) &&
+   /* XXX this condition is always true since BLEND_FLOAT32 is 
never set (bug?). */
!G_0280A0_BLEND_FLOAT32(color_info)) {
color_info |= 
S_0280A0_SOURCE_FORMAT(V_0280A0_EXPORT_NORM);
surf->export_16bpc = true;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/5] r600: use DX10_CLAMP bit in shader setup

2017-11-09 Thread sroland
From: Roland Scheidegger 

The docs are not very concise in what this really does, however both
Alex Deucher and Nicolai Hähnle suggested this only really affects instructions
using the CLAMP output modifier, and I've confirmed that with the newly
changed piglit isinf_and_isnan test.
So, with this bit set, if an instruction has the CLAMP modifier bit (which
clamps to [0,1]) set, then NaNs will be converted to zero, otherwise the result
will be NaN.
D3D10 would require this, glsl doesn't have modifiers (with mesa
clamp(x,0,1) would get converted to such a modifier) coupled with a
whatever-floats-your-boat specified NaN behavior, but the clamp behavior
should probably always be used (this also matches what a decomposition into
min(1.0, max(x, 0.0)) would do, if min/max also adhere to the ieee spec of
picking the non-nan result).
Some apps may in fact rely on this, as this prevents misrenderings in
This War of Mine since using ieee muls
(ce7a045feeef8cad155f1c9aa07f166e146e3d00), without having to use clamped
rcp opcode, which would also fix this bug there.
radeonsi also seems to set this bit nowadays if I see that right (albeit the
llvm amdgpu code comment now says "Make clamp modifier on NaN input returns 0"
instead of "Do not clamp NAN to 0" since it was changed, which also looks
a bit misleading).

v2: set it in all shader stages.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103544
---
 src/gallium/drivers/r600/evergreen_state.c | 6 ++
 src/gallium/drivers/r600/r600_state.c  | 9 +
 2 files changed, 15 insertions(+)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 96eb35a981..ef323bf4f6 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -3235,6 +3235,7 @@ void evergreen_update_ps_state(struct pipe_context *ctx, 
struct r600_pipe_shader
r600_store_value(cb, /* R_028844_SQ_PGM_RESOURCES_PS */
 S_028844_NUM_GPRS(rshader->bc.ngpr) |
 S_028844_PRIME_CACHE_ON_DRAW(1) |
+S_028844_DX10_CLAMP(1) |
 S_028844_STACK_SIZE(rshader->bc.nstack));
/* After that, the NOP relocation packet must be emitted (shader->bo, 
RADEON_USAGE_READ). */
 
@@ -3255,6 +3256,7 @@ void evergreen_update_es_state(struct pipe_context *ctx, 
struct r600_pipe_shader
 
r600_store_context_reg(cb, R_028890_SQ_PGM_RESOURCES_ES,
   S_028890_NUM_GPRS(rshader->bc.ngpr) |
+  S_028890_DX10_CLAMP(1) |
   S_028890_STACK_SIZE(rshader->bc.nstack));
r600_store_context_reg(cb, R_02888C_SQ_PGM_START_ES,
   shader->bo->gpu_address >> 8);
@@ -3317,6 +3319,7 @@ void evergreen_update_gs_state(struct pipe_context *ctx, 
struct r600_pipe_shader
 
r600_store_context_reg(cb, R_028878_SQ_PGM_RESOURCES_GS,
   S_028878_NUM_GPRS(rshader->bc.ngpr) |
+  S_028878_DX10_CLAMP(1) |
   S_028878_STACK_SIZE(rshader->bc.nstack));
r600_store_context_reg(cb, R_028874_SQ_PGM_START_GS,
   shader->bo->gpu_address >> 8);
@@ -3357,6 +3360,7 @@ void evergreen_update_vs_state(struct pipe_context *ctx, 
struct r600_pipe_shader
   S_0286C4_VS_EXPORT_COUNT(nparams - 1));
r600_store_context_reg(cb, R_028860_SQ_PGM_RESOURCES_VS,
   S_028860_NUM_GPRS(rshader->bc.ngpr) |
+  S_028860_DX10_CLAMP(1) |
   S_028860_STACK_SIZE(rshader->bc.nstack));
if (rshader->vs_position_window_space) {
r600_store_context_reg(cb, R_028818_PA_CL_VTE_CNTL,
@@ -3391,6 +3395,7 @@ void evergreen_update_hs_state(struct pipe_context *ctx, 
struct r600_pipe_shader
r600_init_command_buffer(cb, 32);
r600_store_context_reg(cb, R_0288BC_SQ_PGM_RESOURCES_HS,
   S_0288BC_NUM_GPRS(rshader->bc.ngpr) |
+  S_0288BC_DX10_CLAMP(1) |
   S_0288BC_STACK_SIZE(rshader->bc.nstack));
r600_store_context_reg(cb, R_0288B8_SQ_PGM_START_HS,
   shader->bo->gpu_address >> 8);
@@ -3404,6 +3409,7 @@ void evergreen_update_ls_state(struct pipe_context *ctx, 
struct r600_pipe_shader
r600_init_command_buffer(cb, 32);
r600_store_context_reg(cb, R_0288D4_SQ_PGM_RESOURCES_LS,
   S_0288D4_NUM_GPRS(rshader->bc.ngpr) |
+  S_0288D4_DX10_CLAMP(1) |
   S_0288D4_STACK_SIZE(rshader->bc.nstack));
r600_store_context_reg(cb, R_0288D0_SQ_PGM_START_LS,
   shader->bo->gpu_address >> 8);
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r

[Mesa-dev] [PATCH 1/4] r600: use min_dx10/max_dx10 instead of min/max

2017-11-08 Thread sroland
From: Roland Scheidegger 

I believe this is the safe thing to do, especially ever since the driver
actually generates NaNs for muls too.
Albeit since the radeon ISA docs are inaccurate/wrong there, I'm not
entirely sure what the non-dx10 versions do, but (as required by dx10)
the dx10 versions should pick a non-nan source over a nan source.
Other drivers presumably do the same (radeonsi, llvmpipe).
This was shown to make some difference for bug 103544, albeit it is not
required to fix it.
---
 src/gallium/drivers/r600/r600_shader.c  | 12 ++--
 src/gallium/drivers/r600/sb/sb_expr.cpp |  2 ++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 188fbc9d47..6a755bb3fd 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -8844,8 +8844,8 @@ static const struct r600_shader_tgsi_instruction 
r600_shader_tgsi_instruction[]
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
@@ -9042,8 +9042,8 @@ static const struct r600_shader_tgsi_instruction 
eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
@@ -9265,8 +9265,8 @@ static const struct r600_shader_tgsi_instruction 
cm_shader_tgsi_instruction[] =
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp 
b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 3dd3a4815b..7a5d62c8e8 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -753,7 +753,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) {
n.bc.src[0].abs == n.bc.src[1].abs) {
switch (n.bc.op) {
case ALU_OP2_MIN: // (MIN x, x) => (MOV x)
+   case ALU_OP2_MIN_DX10:
case ALU_OP2_MAX:
+   case ALU_OP2_MAX_DX10:
convert_to_mov(n, v0, n.bc.src[0].neg, 
n.bc.src[0].abs);
return fold_alu_op1(n);
case ALU_OP2_ADD:  // (ADD x, x) => (MUL x, 2)
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] r600: set the number type correctly for float rts in cb setup

2017-11-08 Thread sroland
From: Roland Scheidegger 

Float rts were always set as unorm instead of float.
Not sure of the consequences, but at least it looks like the blend clamp
would have been enabled, which is against the rules (only eg really bothered
to even attempt to specify this correctly, r600 always used clamp anyway).
Albeit r600 (not r700) setup still looks bugged to me due to never setting
BLEND_FLOAT32 which must be set according to docs...
Not sure if the hw really cares, no piglit change.
---
 src/gallium/drivers/r600/evergreen_state.c |  7 ++-
 src/gallium/drivers/r600/r600_state.c  | 10 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index ddd59dc0b5..ba08f38f8c 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1042,7 +1042,7 @@ static void evergreen_set_color_surface_buffer(struct 
r600_context *rctx,
}
}
ntype = V_028C70_NUMBER_UNORM;
-   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
ntype = V_028C70_NUMBER_SRGB;
else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
if (desc->channel[i].normalized)
@@ -1054,7 +1054,10 @@ static void evergreen_set_color_surface_buffer(struct 
r600_context *rctx,
ntype = V_028C70_NUMBER_UNORM;
else if (desc->channel[i].pure_integer)
ntype = V_028C70_NUMBER_UINT;
+   } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+   ntype = V_028C70_NUMBER_FLOAT;
}
+
pitch = (pitch / 8) - 1;
color->pitch = S_028C64_PITCH_TILE_MAX(pitch);
 
@@ -1180,6 +1183,8 @@ static void evergreen_set_color_surface_common(struct 
r600_context *rctx,
ntype = V_028C70_NUMBER_UNORM;
else if (desc->channel[i].pure_integer)
ntype = V_028C70_NUMBER_UINT;
+   } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+   ntype = V_028C70_NUMBER_FLOAT;
}
 
if (R600_BIG_ENDIAN)
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c
index c0d0b1667a..0bda8d5b3f 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -817,7 +817,7 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
unsigned offset;
const struct util_format_description *desc;
int i;
-   bool blend_bypass = 0, blend_clamp = 1, do_endian_swap = FALSE;
+   bool blend_bypass = 0, blend_clamp = 0, do_endian_swap = FALSE;
 
if (rtex->db_compatible && !r600_can_sample_zs(rtex, false)) {
r600_init_flushed_depth_texture(&rctx->b.b, surf->base.texture, 
NULL);
@@ -869,6 +869,8 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
ntype = V_0280A0_NUMBER_UNORM;
else if (desc->channel[i].pure_integer)
ntype = V_0280A0_NUMBER_UINT;
+   } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+   ntype = V_0280A0_NUMBER_FLOAT;
}
 
if (R600_BIG_ENDIAN)
@@ -883,6 +885,11 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
 
endian = r600_colorformat_endian_swap(format, do_endian_swap);
 
+   /* blend clamp should be set for all NORM/SRGB types */
+   if (ntype == V_0280A0_NUMBER_UNORM || ntype == V_0280A0_NUMBER_SNORM ||
+   ntype == V_0280A0_NUMBER_SRGB)
+   blend_clamp = 1;
+
/* set blend bypass according to docs if SINT/UINT or
   8/24 COLOR variants */
if (ntype == V_0280A0_NUMBER_UINT || ntype == V_0280A0_NUMBER_SINT ||
@@ -916,6 +923,7 @@ static void r600_init_color_surface(struct r600_context 
*rctx,
 ntype != V_0280A0_NUMBER_UINT &&
 ntype != V_0280A0_NUMBER_SINT) &&
G_0280A0_BLEND_CLAMP(color_info) &&
+   /* XXX this condition is always true since BLEND_FLOAT32 is 
never set (bug?). */
!G_0280A0_BLEND_FLOAT32(color_info)) {
color_info |= 
S_0280A0_SOURCE_FORMAT(V_0280A0_EXPORT_NORM);
surf->export_16bpc = true;
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] r600: use mysterious DX10_CLAMP bit in pixel shader setup

2017-11-08 Thread sroland
From: Roland Scheidegger 

I don't know what this bit really does. The docs are somewhere between
misleading and wrong however, as at least the newer ones (that bit exists with
GCN as well) imply all NaNs would get converted to zeros, which is definitely
NOT the case (and that would not be dx10 compliant either), the r600 ones are
also talking about "dx10 style" vs "dx9 style" clamp, whatever that means for
dx9... Makes no difference at all with piglit's isinf-and-isnan tests, so very
obviously NaNs are still generated just fine.
radeonsi also seems to set this bit nowadays (the llvm amdgpu code comment
now says "Make clamp modifier on NaN input returns 0" instead of "Do not
clamp NAN to 0" since it was changed).

This prevents misrenderings in This War of Mine since using ieee
muls (ce7a045feeef8cad155f1c9aa07f166e146e3d00), without having to use
clamped rcp opcode, which would also fix this.

AMD, it would be really really nice if there would be useful/correct/accurate
information about this bit...
The bit can be set for all shader stages, and maybe it should be set but I
really have no idea...

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103544
---
 src/gallium/drivers/r600/evergreen_state.c | 1 +
 src/gallium/drivers/r600/r600_state.c  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/gallium/drivers/r600/evergreen_state.c 
b/src/gallium/drivers/r600/evergreen_state.c
index 96eb35a981..ddd59dc0b5 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -3235,6 +3235,7 @@ void evergreen_update_ps_state(struct pipe_context *ctx, 
struct r600_pipe_shader
r600_store_value(cb, /* R_028844_SQ_PGM_RESOURCES_PS */
 S_028844_NUM_GPRS(rshader->bc.ngpr) |
 S_028844_PRIME_CACHE_ON_DRAW(1) |
+S_028844_DX10_CLAMP(1) |
 S_028844_STACK_SIZE(rshader->bc.nstack));
/* After that, the NOP relocation packet must be emitted (shader->bo, 
RADEON_USAGE_READ). */
 
diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c
index c21e8dabb1..c0d0b1667a 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2548,6 +2548,7 @@ void r600_update_ps_state(struct pipe_context *ctx, 
struct r600_pipe_shader *sha
r600_store_context_reg_seq(cb, R_028850_SQ_PGM_RESOURCES_PS, 2);
r600_store_value(cb, /* R_028850_SQ_PGM_RESOURCES_PS*/
 S_028850_NUM_GPRS(rshader->bc.ngpr) |
+S_028850_DX10_CLAMP(1) |
 S_028850_STACK_SIZE(rshader->bc.nstack) |
 S_028850_UNCACHED_FIRST_INST(ufi));
r600_store_value(cb, exports_ps); /* R_028854_SQ_PGM_EXPORTS_PS */
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/4] r600: use ieee version of rcp

2017-11-08 Thread sroland
From: Roland Scheidegger 

r600 used the clamped version for rcp, whereas both evergreen and cayman
used the ieee version. I don't know why that discrepancy exists (it does so
since day 1) but there does not seem to be a valid reason for this, so make
it consistent. This seems now safer than before the previous commit (using
the mystery dx10 clamp).
Note that rsq still uses clamped version (as before even though the table
may have suggested otherwise for evergreen) for r600/eg, but not for cayman.
I just don't feel lucky enough to change this (it should also be noted r600
supports sqrt natively, which is always ieee, therefore might not really see
rsqrt with glsl often presumably).
Compile tested only...
---
 src/gallium/drivers/r600/r600_shader.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 6a755bb3fd..628c33787e 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -8830,11 +8830,7 @@ static const struct r600_shader_tgsi_instruction 
r600_shader_tgsi_instruction[]
[TGSI_OPCODE_MOV]   = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_LIT]   = { ALU_OP0_NOP, tgsi_lit},
 
-   /* XXX:
-* For state trackers other than OpenGL, we'll want to use
-* _RECIP_IEEE instead.
-*/
-   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_CLAMPED, 
tgsi_trans_srcx_replicate},
+   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_IEEE, 
tgsi_trans_srcx_replicate},
 
[TGSI_OPCODE_RSQ]   = { ALU_OP0_NOP, tgsi_rsq},
[TGSI_OPCODE_EXP]   = { ALU_OP0_NOP, tgsi_exp},
@@ -9034,7 +9030,7 @@ static const struct r600_shader_tgsi_instruction 
eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_MOV]   = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_LIT]   = { ALU_OP0_NOP, tgsi_lit},
[TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_IEEE, 
tgsi_trans_srcx_replicate},
-   [TGSI_OPCODE_RSQ]   = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
+   [TGSI_OPCODE_RSQ]   = { ALU_OP0_NOP, tgsi_rsq},
[TGSI_OPCODE_EXP]   = { ALU_OP0_NOP, tgsi_exp},
[TGSI_OPCODE_LOG]   = { ALU_OP0_NOP, tgsi_log},
[TGSI_OPCODE_MUL]   = { ALU_OP2_MUL_IEEE, tgsi_op2},
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] r600: use the clamped versions of rcp/rsq for eg/cayman.

2017-11-07 Thread sroland
From: Roland Scheidegger 

r600 already used the clamped versions, but for some reason this was
different to eg/cayman.
(Note that it has been different since essentially forever, 7 years, since
df62338c491f2cace1a48f99de78e83b5edd82fd in particular, which changed
this for r600 but not eg (cayman wasn't supported back then, but probably
copied this from the eg part later). The commit does not mention any reason
why this difference should exist.)
This seems a bit unfortunate, since it would be nice to use ieee arithmetic,
I have no idea what this could potentially break and no idea if it really
makes sense going back to legacy-style rcp/rsq...
This however prevents misrenderings in This War of Mine since using ieee
muls (ce7a045feeef8cad155f1c9aa07f166e146e3d00), albeit strictly speaking
only rcp_clamped is necessary for this. It seems likely the root cause is
some x * rcp(y) calculation where both x and y evaluate to 0. Albeit it
apparently works with other drivers, not sure what's up with that...

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103544
---
 src/gallium/drivers/r600/r600_shader.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 6a755bb3fd..62fc4da901 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -9033,8 +9033,12 @@ static const struct r600_shader_tgsi_instruction 
eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_ARL]   = { ALU_OP0_NOP, tgsi_eg_arl},
[TGSI_OPCODE_MOV]   = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_LIT]   = { ALU_OP0_NOP, tgsi_lit},
-   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_IEEE, 
tgsi_trans_srcx_replicate},
-   [TGSI_OPCODE_RSQ]   = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
+   /* XXX:
+* For state trackers other than OpenGL, we'll want to use
+* _RECIP_IEEE/_RECIPSQRT_IEEE instead.
+*/
+   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_CLAMPED, 
tgsi_trans_srcx_replicate},
+   [TGSI_OPCODE_RSQ]   = { ALU_OP1_RECIPSQRT_CLAMPED, tgsi_rsq},
[TGSI_OPCODE_EXP]   = { ALU_OP0_NOP, tgsi_exp},
[TGSI_OPCODE_LOG]   = { ALU_OP0_NOP, tgsi_log},
[TGSI_OPCODE_MUL]   = { ALU_OP2_MUL_IEEE, tgsi_op2},
@@ -9256,8 +9260,12 @@ static const struct r600_shader_tgsi_instruction 
cm_shader_tgsi_instruction[] =
[TGSI_OPCODE_ARL]   = { ALU_OP0_NOP, tgsi_eg_arl},
[TGSI_OPCODE_MOV]   = { ALU_OP1_MOV, tgsi_op2},
[TGSI_OPCODE_LIT]   = { ALU_OP0_NOP, tgsi_lit},
-   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_IEEE, 
cayman_emit_float_instr},
-   [TGSI_OPCODE_RSQ]   = { ALU_OP1_RECIPSQRT_IEEE, 
cayman_emit_float_instr},
+   /* XXX:
+* For state trackers other than OpenGL, we'll want to use
+* _RECIP_IEEE/_RECIPSQRT_IEEE instead.
+*/
+   [TGSI_OPCODE_RCP]   = { ALU_OP1_RECIP_CLAMPED, 
cayman_emit_float_instr},
+   [TGSI_OPCODE_RSQ]   = { ALU_OP1_RECIPSQRT_CLAMPED, 
cayman_emit_float_instr},
[TGSI_OPCODE_EXP]   = { ALU_OP0_NOP, tgsi_exp},
[TGSI_OPCODE_LOG]   = { ALU_OP0_NOP, tgsi_log},
[TGSI_OPCODE_MUL]   = { ALU_OP2_MUL_IEEE, tgsi_op2},
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] r600: use min_dx10/max_dx10 instead of min/max

2017-11-07 Thread sroland
From: Roland Scheidegger 

I believe this is the safe thing to do, especially ever since the driver
actually generates NaNs for muls too.
Albeit since the radeon ISA docs are inaccurate/wrong there, I'm not
entirely sure what the non-dx10 versions do, but (as required by dx10)
the dx10 versions should pick a non-nan source over a nan source.
Other drivers presumably do the same (radeonsi, llvmpipe).
This was shown to make some difference for bug 103544, albeit it is not
required to fix it.
---
 src/gallium/drivers/r600/r600_shader.c  | 12 ++--
 src/gallium/drivers/r600/sb/sb_expr.cpp |  2 ++
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 188fbc9d47..6a755bb3fd 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -8844,8 +8844,8 @@ static const struct r600_shader_tgsi_instruction 
r600_shader_tgsi_instruction[]
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
@@ -9042,8 +9042,8 @@ static const struct r600_shader_tgsi_instruction 
eg_shader_tgsi_instruction[] =
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
@@ -9265,8 +9265,8 @@ static const struct r600_shader_tgsi_instruction 
cm_shader_tgsi_instruction[] =
[TGSI_OPCODE_DP3]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DP4]   = { ALU_OP2_DOT4_IEEE, tgsi_dp},
[TGSI_OPCODE_DST]   = { ALU_OP0_NOP, tgsi_opdst},
-   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN, tgsi_op2},
-   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX, tgsi_op2},
+   [TGSI_OPCODE_MIN]   = { ALU_OP2_MIN_DX10, tgsi_op2},
+   [TGSI_OPCODE_MAX]   = { ALU_OP2_MAX_DX10, tgsi_op2},
[TGSI_OPCODE_SLT]   = { ALU_OP2_SETGT, tgsi_op2_swap},
[TGSI_OPCODE_SGE]   = { ALU_OP2_SETGE, tgsi_op2},
[TGSI_OPCODE_MAD]   = { ALU_OP3_MULADD_IEEE, tgsi_op3},
diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp 
b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 3dd3a4815b..7a5d62c8e8 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -753,7 +753,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) {
n.bc.src[0].abs == n.bc.src[1].abs) {
switch (n.bc.op) {
case ALU_OP2_MIN: // (MIN x, x) => (MOV x)
+   case ALU_OP2_MIN_DX10:
case ALU_OP2_MAX:
+   case ALU_OP2_MAX_DX10:
convert_to_mov(n, v0, n.bc.src[0].neg, 
n.bc.src[0].abs);
return fold_alu_op1(n);
case ALU_OP2_ADD:  // (ADD x, x) => (MUL x, 2)
-- 
2.12.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] docs: Fix GL_MESA_program_debug enums

2017-11-06 Thread sroland
From: Roland Scheidegger 

13b303ff9265b89bdd9100e32f905e9cdadfad81 added the actual enums but
didn't remove the already existing placeholder ones. (And also duplicated
the "fragment" names instead of using the "vertex" names.)
---
 docs/specs/enums.txt | 26 --
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/docs/specs/enums.txt b/docs/specs/enums.txt
index 4b0485f..7b5709b 100644
--- a/docs/specs/enums.txt
+++ b/docs/specs/enums.txt
@@ -46,14 +46,14 @@ GL_MESA_shader_debug.spec: (obsolete)
 GL_DEBUG_ASSERT_MESA 0x875B
 
 GL_MESA_program_debug: (obsolete)
-   GL_FRAGMENT_PROGRAM_CALLBACK_MESA  0x
-   GL_VERTEX_PROGRAM_CALLBACK_MESA0x
-   GL_FRAGMENT_PROGRAM_POSITION_MESA  0x
-   GL_VERTEX_PROGRAM_POSITION_MESA0x
-   GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA 0x
-   GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA 0x
-   GL_VERTEX_PROGRAM_CALLBACK_FUNC_MESA   0x
-   GL_VERTEX_PROGRAM_CALLBACK_DATA_MESA   0x
+   GL_FRAGMENT_PROGRAM_POSITION_MESA   0x8BB0
+   GL_FRAGMENT_PROGRAM_CALLBACK_MESA   0x8BB1
+   GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA  0x8BB2
+   GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA  0x8BB3
+   GL_VERTEX_PROGRAM_POSITION_MESA 0x8BB4
+   GL_VERTEX_PROGRAM_CALLBACK_MESA 0x8BB5
+   GL_VERTEX_PROGRAM_CALLBACK_FUNC_MESA0x8BB6
+   GL_VERTEX_PROGRAM_CALLBACK_DATA_MESA0x8BB7
 
 GL_MESAX_texture_stack:
GL_TEXTURE_1D_STACK_MESAX0x8759
@@ -63,16 +63,6 @@ GL_MESAX_texture_stack:
GL_TEXTURE_1D_STACK_BINDING_MESAX0x875D
GL_TEXTURE_2D_STACK_BINDING_MESAX0x875E
 
-GL_MESA_program_debug
-   GL_FRAGMENT_PROGRAM_POSITION_MESA   0x8BB0
-   GL_FRAGMENT_PROGRAM_CALLBACK_MESA   0x8BB1
-   GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA  0x8BB2
-   GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA  0x8BB3
-   GL_FRAGMENT_PROGRAM_POSITION_MESA   0x8BB4
-   GL_FRAGMENT_PROGRAM_CALLBACK_MESA   0x8BB5
-   GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA  0x8BB6
-   GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA  0x8BB7
-
 GL_MESA_tile_raster_order
GL_TILE_RASTER_ORDER_FIXED_MESA 0x8BB8
GL_TILE_RASTER_ORDER_INCREASING_X_MESA  0x8BB9
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: don't cull tris with zero area

2017-10-26 Thread sroland
From: Roland Scheidegger 

Culling tris with zero area seems like a great idea, but apparently with
fill mode line (and point) we're supposed to draw them, at least some tests
for some other state tracker complained otherwise.
Such tris also always seem to be back facing (not sure if this can be
inferred from anything, since in a mathematical sense it cannot really be
determined), so make sure to account for this when filling in the face
information.
(For solid tris, this is of course unnecessary, drivers will throw the tris
away later in any case.)
---
 src/gallium/auxiliary/draw/draw_pipe_cull.c | 10 ++
 src/gallium/auxiliary/draw/draw_pipe_unfilled.c |  5 ++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c 
b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index 3e8e458..3863485 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -181,6 +181,16 @@ static void cull_tri( struct draw_stage *stage,
 /* triangle is not culled, pass to next stage */
 stage->next->tri( stage->next, header );
  }
+  } else {
+ /*
+  * With zero aera, this is back facing (because the spec says
+  * it's front facing if sign is positive?).
+  * Some apis apparently do not allow us to cull zero aera tris
+  * here, in case of fill mode line (which is rather lame).
+  */
+ if ((PIPE_FACE_BACK & cull_stage(stage)->cull_face) == 0) {
+stage->next->tri( stage->next, header );
+ }
   }
}
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c 
b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index c465c75..f39db0e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -63,10 +63,9 @@ inject_front_face_info(struct draw_stage *stage,
struct prim_header *header)
 {
struct unfilled_stage *unfilled = unfilled_stage(stage);
-   unsigned ccw = header->det < 0.0;
boolean is_front_face = (
-  (stage->draw->rasterizer->front_ccw && ccw) ||
-  (!stage->draw->rasterizer->front_ccw && !ccw));
+  (stage->draw->rasterizer->front_ccw && header->det < 0.0f) ||
+  (!stage->draw->rasterizer->front_ccw && header->det > 0.0f));
int slot = unfilled->face_slot;
unsigned i;
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallium/util: remove some block alignment assertions

2017-10-24 Thread sroland
From: Roland Scheidegger 

These assertions were revisited a couple of times in the past, and they
still weren't quite right.
The problem I was seeing (with some other state tracker) was a copy between
two 512x512 s3tc textures, but from mip level 0 to mip level 8. Therefore,
the destination has only size 2x2 (not a full block), so the box width/height
was only 2, causing the assertion to trigger for src alignment.
As far as I can tell, such a copy is completely legal, and because a correct
assertion would get ridiculously complicated, just get rid of it for good.
---
 src/gallium/auxiliary/util/u_surface.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_surface.c 
b/src/gallium/auxiliary/util/u_surface.c
index 5abf966..0a79a25 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -324,16 +324,8 @@ util_resource_copy_region(struct pipe_context *pipe,
/* check that region boxes are block aligned */
assert(src_box.x % src_bw == 0);
assert(src_box.y % src_bh == 0);
-   assert(src_box.width % src_bw == 0 ||
-  src_box.x + src_box.width == u_minify(src->width0, src_level));
-   assert(src_box.height % src_bh == 0 ||
-  src_box.y + src_box.height == u_minify(src->height0, src_level));
assert(dst_box.x % dst_bw == 0);
assert(dst_box.y % dst_bh == 0);
-   assert(dst_box.width % dst_bw == 0 ||
-  dst_box.x + dst_box.width == u_minify(dst->width0, dst_level));
-   assert(dst_box.height % dst_bh == 0 ||
-  dst_box.y + dst_box.height == u_minify(dst->height0, dst_level));
 
/* check that region boxes are not out of bounds */
assert(src_box.x + src_box.width <= u_minify(src->width0, src_level));
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] tgsi: fix tgsi_util_get_inst_usage_mask

2017-10-18 Thread sroland
From: Roland Scheidegger 

The logic for handling shadow coords was completely broken.
Fixes be3ab867bd444594f9d9e0f8e59d305d15769afd.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103265
---
 src/gallium/auxiliary/tgsi/tgsi_util.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c 
b/src/gallium/auxiliary/tgsi/tgsi_util.c
index be8bcdf..cfce590 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -292,17 +292,17 @@ tgsi_util_get_inst_usage_mask(const struct 
tgsi_full_instruction *inst,
case TGSI_OPCODE_TXL2:
case TGSI_OPCODE_LODQ:
case TGSI_OPCODE_TG4: {
-  unsigned dim_layer_shadow =
+  unsigned dim_layer =
  tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
-  unsigned dim_layer, dim;
+  unsigned dim_layer_shadow, dim;
 
-  /* Remove shadow. */
+  /* Add shadow. */
   if (tgsi_is_shadow_target(inst->Texture.Texture)) {
- dim_layer = dim_layer_shadow - 1;
+ dim_layer_shadow = dim_layer + 1;
  if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D)
-dim_layer = 1;
+dim_layer_shadow = 3;
   } else {
- dim_layer = dim_layer_shadow;
+ dim_layer_shadow = dim_layer;
   }
 
   /* Remove layer. */
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


  1   2   3   4   5   6   7   >