[Mesa-dev] [PATCH] util/atomic: Fix p_atomic_add for unlocked and msvc paths
From: Roland Scheidegger Braces mismatch (flagged by CI, untested). Fixes: 385d13f26d2 "util/atomic: Add a _return variant of p_atomic_add" --- src/util/u_atomic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/u_atomic.h b/src/util/u_atomic.h index 9cbc6dd1eaa..1ad87c8feb1 100644 --- a/src/util/u_atomic.h +++ b/src/util/u_atomic.h @@ -89,7 +89,7 @@ #define p_atomic_dec_zero(_v) (p_atomic_dec_return(_v) == 0) #define p_atomic_inc(_v) ((void) p_atomic_inc_return(_v)) #define p_atomic_dec(_v) ((void) p_atomic_dec_return(_v)) -#define p_atomic_add(_v, _i) ((void) p_atomic_add_return((_v), (_i)) +#define p_atomic_add(_v, _i) ((void) p_atomic_add_return((_v), (_i))) #define p_atomic_inc_return(_v) (++(*(_v))) #define p_atomic_dec_return(_v) (--(*(_v))) #define p_atomic_add_return(_v, _i) (*(_v) = *(_v) + (_i)) @@ -146,7 +146,7 @@ (assert(!"should not get here"), 0)) #define p_atomic_add(_v, _i) \ - ((void) p_atomic_add_return((_v), (_i)) + ((void) p_atomic_add_return((_v), (_i))) #define p_atomic_add_return(_v, _i) (\ sizeof *(_v) == sizeof(char)? _InterlockedExchangeAdd8 ((char *) (_v), (_i)) : \ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: Fix saturated signed psub/padd intrinsics on llvm 8
From: Roland Scheidegger LLVM 8 did remove both the signed and unsigned sse2/avx intrinsics in the end, and provide arch-independent llvm intrinsics instead. Fixes a crash when using snorm framebuffers (tested with piglit arb_color_buffer_float-render GL_RGBA8_SNORM -auto). CC: --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 28 - 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 6b7ce9aacf9..53ee00e6767 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -465,7 +465,7 @@ lp_build_add(struct lp_build_context *bld, return bld->one; if (!type.floating && !type.fixed) { - if (LLVM_VERSION_MAJOR >= 9) { + if (LLVM_VERSION_MAJOR >= 8) { char intrin[32]; intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat"; lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); @@ -474,11 +474,9 @@ lp_build_add(struct lp_build_context *bld, if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.b" : NULL; + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.w" : NULL; + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; } else if (util_cpu_caps.has_altivec) { if (type.width == 8) intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; @@ -489,11 +487,9 @@ lp_build_add(struct lp_build_context *bld, if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.b" : NULL; + intrinsic = type.sign ? 
"llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.w" : NULL; + intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; } } } @@ -793,7 +789,7 @@ lp_build_sub(struct lp_build_context *bld, return bld->zero; if (!type.floating && !type.fixed) { - if (LLVM_VERSION_MAJOR >= 9) { + if (LLVM_VERSION_MAJOR >= 8) { char intrin[32]; intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat"; lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); @@ -802,11 +798,9 @@ lp_build_sub(struct lp_build_context *bld, if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.b" : NULL; + intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.w" : NULL; + intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; } else if (util_cpu_caps.has_altivec) { if (type.width == 8) intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; @@ -817,11 +811,9 @@ lp_build_sub(struct lp_build_context *bld, if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.b" : NULL; + intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : - LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.w" : NULL; + intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w"; } } } -- 2.17.1 ___
[Mesa-dev] [PATCH] llvmpipe: increase max texture size to 2GB
From: Roland Scheidegger The 1GB limit was arbitrary, increase this to 2GB (which is the max possible without code changes). --- src/gallium/drivers/llvmpipe/lp_limits.h | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h index c2808162c78..569179ecdf4 100644 --- a/src/gallium/drivers/llvmpipe/lp_limits.h +++ b/src/gallium/drivers/llvmpipe/lp_limits.h @@ -43,7 +43,11 @@ /** * Max texture sizes */ -#define LP_MAX_TEXTURE_SIZE (1 * 1024 * 1024 * 1024ULL) /* 1GB for now */ +/** + * 2GB is the actual max currently (we always use 32bit offsets, and both + * llvm GEP as well as avx2 gather use signed offsets). + */ +#define LP_MAX_TEXTURE_SIZE (2 * 1024 * 1024 * 1024ULL) #define LP_MAX_TEXTURE_2D_LEVELS 14 /* 8K x 8K for now */ #define LP_MAX_TEXTURE_3D_LEVELS 12 /* 2K x 2K x 2K for now */ #define LP_MAX_TEXTURE_CUBE_LEVELS 14 /* 8K x 8K for now */ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] llvmpipe: fix CALLOC vs. free mismatches
From: Roland Scheidegger Should fix some issues we're seeing. And use REALLOC instead of realloc. --- src/gallium/drivers/llvmpipe/lp_cs_tpool.c | 6 +++--- src/gallium/drivers/llvmpipe/lp_state_cs.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_cs_tpool.c b/src/gallium/drivers/llvmpipe/lp_cs_tpool.c index 04495727e1c..6f1b4e2ee55 100644 --- a/src/gallium/drivers/llvmpipe/lp_cs_tpool.c +++ b/src/gallium/drivers/llvmpipe/lp_cs_tpool.c @@ -65,7 +65,7 @@ lp_cs_tpool_worker(void *data) cnd_broadcast(&task->finish); } mtx_unlock(&pool->m); - free(lmem.local_mem_ptr); + FREE(lmem.local_mem_ptr); return 0; } @@ -105,7 +105,7 @@ lp_cs_tpool_destroy(struct lp_cs_tpool *pool) cnd_destroy(&pool->new_work); mtx_destroy(&pool->m); - free(pool); + FREE(pool); } struct lp_cs_tpool_task * @@ -148,6 +148,6 @@ lp_cs_tpool_wait_for_task(struct lp_cs_tpool *pool, mtx_unlock(&pool->m); cnd_destroy(&task->finish); - free(task); + FREE(task); *task_handle = NULL; } diff --git a/src/gallium/drivers/llvmpipe/lp_state_cs.c b/src/gallium/drivers/llvmpipe/lp_state_cs.c index 1645a185cb2..a26cbf4df22 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_cs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_cs.c @@ -1123,8 +1123,9 @@ cs_exec_fn(void *init_data, int iter_idx, struct lp_cs_local_mem *lmem) memset(&thread_data, 0, sizeof(thread_data)); if (lmem->local_size < job_info->req_local_mem) { + lmem->local_mem_ptr = REALLOC(lmem->local_mem_ptr, lmem->local_size, +job_info->req_local_mem); lmem->local_size = job_info->req_local_mem; - lmem->local_mem_ptr = realloc(lmem->local_mem_ptr, lmem->local_size); } thread_data.shared = lmem->local_mem_ptr; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: use fallback code for mul_hi with llvm >= 7.0
From: Roland Scheidegger LLVM 7.0 ditched the pmulu intrinsics. This is only a trivial patch to use the fallback code instead. It'll likely produce atrocious code since the pattern doesn't match what llvm itself uses in its autoupgrade paths, hence the pattern won't be recognized. Should fix https://bugs.freedesktop.org/show_bug.cgi?id=111496 --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index c4931c0b230..f1866c6625f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1169,8 +1169,13 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, * https://llvm.org/bugs/show_bug.cgi?id=30845 * So, whip up our own code, albeit only for length 4 and 8 (which * should be good enough)... +* FIXME: For llvm >= 7.0 we should match the autoupgrade pattern +* (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle +* for signed), which the fallback code does not, without this llvm +* will likely still produce atrocious code. */ - if ((bld->type.length == 4 || bld->type.length == 8) && + if (HAVE_LLVM < 0x0700 && + (bld->type.length == 4 || bld->type.length == 8) && ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || util_cpu_caps.has_sse4_1)) { const char *intrinsic = NULL; -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix issue with AtomicCmpXchg wrapper on llvm 3.5-3.8
From: Roland Scheidegger These versions still need wrapper but already have both success and failure ordering. (Compile tested on llvm 3.7, llvm 3.8.) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=02 --- src/gallium/auxiliary/gallivm/lp_bld_misc.cpp | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 79d10293e80..723c84d57c2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -822,15 +822,29 @@ static llvm::AtomicOrdering mapFromLLVMOrdering(LLVMAtomicOrdering Ordering) { llvm_unreachable("Invalid LLVMAtomicOrdering value!"); } +#if HAVE_LLVM < 0x305 LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, LLVMValueRef Cmp, LLVMValueRef New, LLVMAtomicOrdering SuccessOrdering, LLVMAtomicOrdering FailureOrdering, LLVMBool SingleThread) { - /* LLVM 3.8 doesn't have a second ordering and uses old SynchronizationScope enum */ + /* LLVM < 3.5 doesn't have a second ordering and uses old SynchronizationScope enum */ return llvm::wrap(llvm::unwrap(B)->CreateAtomicCmpXchg(llvm::unwrap(Ptr), llvm::unwrap(Cmp), llvm::unwrap(New), mapFromLLVMOrdering(SuccessOrdering), SingleThread ? llvm::SynchronizationScope::SingleThread : llvm::SynchronizationScope::CrossThread)); } +#else +LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr, +LLVMValueRef Cmp, LLVMValueRef New, +LLVMAtomicOrdering SuccessOrdering, +LLVMAtomicOrdering FailureOrdering, +LLVMBool SingleThread) +{ + return llvm::wrap(llvm::unwrap(B)->CreateAtomicCmpXchg(llvm::unwrap(Ptr), llvm::unwrap(Cmp), + llvm::unwrap(New), mapFromLLVMOrdering(SuccessOrdering), + mapFromLLVMOrdering(FailureOrdering), + SingleThread ? 
llvm::SynchronizationScope::SingleThread : llvm::SynchronizationScope::CrossThread)); +} +#endif #endif -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] scons: fix build with llvm 9.
From: Roland Scheidegger The x86asmprinter component is gone, and things seem to work by just removing it. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110707 --- scons/llvm.py | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scons/llvm.py b/scons/llvm.py index a84ad51d97a..bf9666459c6 100644 --- a/scons/llvm.py +++ b/scons/llvm.py @@ -260,7 +260,10 @@ def generate(env): if '-fno-rtti' in cxxflags: env.Append(CXXFLAGS = ['-fno-rtti']) -components = ['engine', 'mcjit', 'bitwriter', 'x86asmprinter', 'mcdisassembler', 'irreader'] +if llvm_version < distutils.version.LooseVersion('9.0'): + components = ['engine', 'mcjit', 'bitwriter', 'x86asmprinter', 'mcdisassembler', 'irreader'] +else: + components = ['engine', 'mcjit', 'bitwriter', 'mcdisassembler', 'irreader'] env.ParseConfig('%s --libs ' % llvm_config + ' '.join(components)) env.ParseConfig('%s --ldflags' % llvm_config) -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix default cbuf info.
From: Roland Scheidegger The default null_output really needs to be static, otherwise the values we'll eventually get later are doubly random (they are not initialized, and even if they were it's a pointer to a local stack variable). VMware bug 2349556. --- src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c index b4e3c2fbc8..9fc9b8c77e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c @@ -608,7 +608,7 @@ finished: */ for (index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) { - const struct lp_tgsi_channel_info null_output[4]; + static const struct lp_tgsi_channel_info null_output[4]; info->cbuf[index] = null_output; } -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] auxiliary/draw: fix crash with zero-stride draw auto
From: Roland Scheidegger transform feedback draws get the number of vertices from the transform feedback object. In draw, we'll figure this out with the number of bytes written divided by the stride. However, it is apparently possible we end up with a stride of 0 there (not entirely sure it could happen with GL). Probably when nothing was actually ever written (so we don't actually have a stride set). Just avoid the division by zero by setting the count to 0. --- src/gallium/auxiliary/draw/draw_pt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 50286149cd4..eeebca30ce7 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -440,7 +440,8 @@ resolve_draw_info(const struct pipe_draw_info *raw_info, struct draw_so_target *target = (struct draw_so_target *)info->count_from_stream_output; assert(vertex_buffer != NULL); - info->count = target->internal_offset / vertex_buffer->stride; + info->count = vertex_buffer->stride == 0 ? 0 : + target->internal_offset / vertex_buffer->stride; /* Stream output draw can not be indexed */ debug_assert(!info->index_size); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix broken 8-wide s3tc decoding
From: Roland Scheidegger Brian noticed there was an uninitialized var for the 8-wide case and 128 bit blocks, which made it always crash. Likewise, the 64bit block case had another crash bug due to type mismatch. Color decode (used for all s3tc formats) also had a bogus shuffle for this case, leading to decode artifacts. Fix these all up, which makes the code actually work 8-wide. Note that it's still not used - I've verified it works, and the generated assembly does look quite a bit simpler actually (20-30% fewer instructions for the s3tc decode part with avx2), however in practice it still seems to be slightly slower for some unknown reason (tested with openarena) on my haswell box, so for now continue to split things into 4-wide vectors before decoding. --- .../auxiliary/gallivm/lp_bld_format_s3tc.c| 33 +-- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c index 9561c349dad..8f6e9bec18a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c @@ -77,24 +77,17 @@ lp_build_uninterleave2_half(struct gallivm_state *gallivm, unsigned lo_hi) { LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH]; - unsigned i, j; + unsigned i; assert(type.length <= LP_MAX_VECTOR_LENGTH); assert(lo_hi < 2); if (type.length * type.width == 256) { - assert(type.length >= 4); - for (i = 0, j = 0; i < type.length; ++i) { - if (i == type.length / 4) { -j = type.length; - } else if (i == type.length / 2) { -j = type.length / 2; - } else if (i == 3 * type.length / 4) { -j = 3 * type.length / 4; - } else { -j += 2; - } - elems[i] = lp_build_const_int32(gallivm, j + lo_hi); + assert(type.length == 8); + assert(type.width == 32); + const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14}; + for (i = 0; i < type.length; ++i) { + elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi); } } else { for (i = 0; i < 
type.length; ++i) { @@ -277,7 +270,7 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm, } else { LLVMValueRef tmp[4], cc01, cc23; - struct lp_type lp_type32, lp_type64, lp_type32dxt; + struct lp_type lp_type32, lp_type64; memset(&lp_type32, 0, sizeof lp_type32); lp_type32.width = 32; lp_type32.length = length; @@ -309,10 +302,14 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm, lp_build_const_extend_shuffle(gallivm, 2, 4), ""); } if (length == 8) { +struct lp_type lp_type32_4; +memset(&lp_type32_4, 0, sizeof lp_type32_4); +lp_type32_4.width = 32; +lp_type32_4.length = 4; for (i = 0; i < 4; ++i) { tmp[0] = elems[i]; tmp[1] = elems[i+4]; - elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2); + elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2); } } cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0); @@ -811,7 +808,7 @@ s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm, tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi); bit_pos = LLVMBuildAnd(builder, bit_pos, lp_build_const_int_vec(gallivm, type, 0xffdf), ""); - /* Warning: slow shift with per element count */ + /* Warning: slow shift with per element count (without avx2) */ /* * Could do pshufb here as well - just use appropriate 2 bits in bit_pos * to select the right byte with pshufb. Then for the remaining one bit @@ -1640,7 +1637,6 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm, lp_build_const_int_vec(gallivm, type16, 8), ""); alpha = LLVMBuildBitCast(builder, alpha, i64t, ""); shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8); - /* XXX this shuffle broken with LLVM 2.8 */ alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, ""); alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, ""); @@ -2176,6 +2172,9 @@ lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm, return rgba; } + /* +* Could use n > 8 here with avx2, but doesn't seem faster. 
+*/ if (n > 4) { unsigned count; LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix saturated signed add / sub with llvm 9
From: Roland Scheidegger llvm 8 removed saturated unsigned add / sub x86 sse2 intrinsics, and now llvm 9 removed the signed versions as well - they were proposed for removal earlier, but the pattern to recognize those was very complex, so it wasn't done then. However, instead of these arch-specific intrinsics, there's now arch-independent intrinsics for saturated add / sub, both for signed and unsigned, so use these. They should have only advantages (work with arbitrary vector sizes, optimal code for all archs), although I don't know how well they work in practice for other archs (at least for x86 they do the right thing). Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110454 --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 057c50ed278..02fb81afe51 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -555,6 +555,12 @@ lp_build_add(struct lp_build_context *bld, return bld->one; if (!type.floating && !type.fixed) { + if (HAVE_LLVM >= 0x0900) { +char intrin[32]; +intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat"; +lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); +return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); + } if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { if (type.width == 8) @@ -625,6 +631,7 @@ lp_build_add(struct lp_build_context *bld, * NOTE: cmp/select does sext/trunc of the mask. Does not seem to * interfere with llvm's ability to recognize the pattern but seems * a bit brittle. + * NOTE: llvm 9+ always uses (non arch specific) intrinsic. 
*/ LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res); res = lp_build_select(bld, overflowed, @@ -876,6 +883,12 @@ lp_build_sub(struct lp_build_context *bld, return bld->zero; if (!type.floating && !type.fixed) { + if (HAVE_LLVM >= 0x0900) { +char intrin[32]; +intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat"; +lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); +return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); + } if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { if (type.width == 8) @@ -925,6 +938,7 @@ lp_build_sub(struct lp_build_context *bld, * NOTE: cmp/select does sext/trunc of the mask. Does not seem to * interfere with llvm's ability to recognize the pattern but seems * a bit brittle. + * NOTE: llvm 9+ always uses (non arch specific) intrinsic. */ LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); a = lp_build_select(bld, no_ov, a, b); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix bogus assert in get_indirect_index
From: Roland Scheidegger 0 is a valid value as max index, and the code handles it fine. This isn't commonly seen, as it will only happen with array declarations of size 1. The assert was introduced with a3c898dc97ec5f0e0b93b2ee180bdf8ca3bab14c. Fixes piglit tests/shaders/complex-loop-analysis-bug.shader_test Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110441 --- src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 0f5b3d9acb7..d6af1d84471 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -1108,7 +1108,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld, * larger than the declared size but smaller than the buffer size. */ if (reg_file != TGSI_FILE_CONSTANT) { - assert(index_limit > 0); + assert(index_limit >= 0); max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, index_limit); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: abort when trying to use non-existing intrinsic
From: Roland Scheidegger Whenever llvm removes an intrinsic (we're using), we're hitting segfaults due to llvm doing calls to address 0 in the jitted code instead. However, Jose figured out we can actually detect this with LLVMGetIntrinsicID(), so use this to abort, so we don't have to wonder what got broken. (Of course, someone still needs to fix the code to no longer use this intrinsic.) --- src/gallium/auxiliary/gallivm/lp_bld_intr.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c index 74ed16f33f0..c9df136b103 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -241,6 +241,16 @@ lp_build_intrinsic(LLVMBuilderRef builder, function = lp_declare_intrinsic(module, name, ret_type, arg_types, num_args); + /* + * If llvm removes an intrinsic we use, we'll hit this abort (rather + * than a call to address zero in the jited code). + */ + if (LLVMGetIntrinsicID(function) == 0) { + printf("llvm (version 0x%x) found no intrinsic for %s, going to crash...\n", +HAVE_LLVM, name); + abort(); + } + if (!set_callsite_attrs) lp_add_func_attributes(function, attr_mask); -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: don't use pavg.b intrinsic on llvm >= 6.0
From: Roland Scheidegger This intrinsic disappeared with llvm 6.0, using it ends up in segfaults (due to llvm issuing call to NULL address in the jitted shaders). Add code doing the same thing as the autoupgrade code in llvm so it can be matched and replaced back with a pavgb. While here, also improve lp_test_format, so it tests both with and without cache (as it was, it tested the cache versions only, whereas cache is actually disabled in llvmpipe, and in any case even with it enabled vertex and geometry shaders wouldn't use it). (Although at least for the unorm8 uncached fetch, the code is still quite different to what llvmpipe is using, since that would use unorm8x16 type, whereas the test code is using unorm8x4 type, hence disabling some intrinsic paths.) Fixes: 6f4083143bb8c478ccfcaef034d183d89b471993 --- .../auxiliary/gallivm/lp_bld_format_s3tc.c| 55 +-- src/gallium/drivers/llvmpipe/lp_test_format.c | 91 ++- 2 files changed, 95 insertions(+), 51 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c index 2b143566f24..9561c349dad 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c @@ -457,6 +457,50 @@ color_expand_565_to_(struct gallivm_state *gallivm, } +/* + * Average two byte vectors. (Will always round up.) + */ +static LLVMValueRef +lp_build_pavgb(struct lp_build_context *bld8, + LLVMValueRef v0, + LLVMValueRef v1) +{ + struct gallivm_state *gallivm = bld8->gallivm; + LLVMBuilderRef builder = gallivm->builder; + assert(bld8->type.width == 8); + assert(bld8->type.length == 16 || bld8->type.length == 32); + if (HAVE_LLVM < 0x0600) { + LLVMValueRef intrargs[2]; + char *intr_name = bld8->type.length == 32 ? 
"llvm.x86.avx2.pavg.b" : + "llvm.x86.sse2.pavg.b"; + intrargs[0] = v0; + intrargs[1] = v1; + return lp_build_intrinsic(builder, intr_name, +bld8->vec_type, intrargs, 2, 0); + } else { + /* + * Must match llvm's autoupgrade of pavg.b intrinsic to be useful. + * You better hope the backend code manages to detect the pattern, and + * the pattern doesn't change there... + */ + struct lp_type type_ext = bld8->type; + LLVMTypeRef vec_type_ext; + LLVMValueRef res; + LLVMValueRef ext_one; + type_ext.width = 16; + vec_type_ext = lp_build_vec_type(gallivm, type_ext); + ext_one = lp_build_const_vec(gallivm, type_ext, 1); + + v0 = LLVMBuildZExt(builder, v0, vec_type_ext, ""); + v1 = LLVMBuildZExt(builder, v1, vec_type_ext, ""); + res = LLVMBuildAdd(builder, v0, v1, ""); + res = LLVMBuildAdd(builder, res, ext_one, ""); + res = LLVMBuildLShr(builder, res, ext_one, ""); + res = LLVMBuildTrunc(builder, res, bld8->vec_type, ""); + return res; + } +} + /** * Calculate 1/3(v1-v0) + v0 * and 2*1/3(v1-v0) + v0 @@ -602,13 +646,7 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm, */ if ((util_cpu_caps.has_sse2 && n == 4) || (util_cpu_caps.has_avx2 && n == 8)) { - LLVMValueRef intrargs[2]; - char *intr_name = n == 8 ? 
"llvm.x86.avx2.pavg.b" : -"llvm.x86.sse2.pavg.b"; - intrargs[0] = colors0; - intrargs[1] = colors1; - color2_2 = lp_build_intrinsic(builder, intr_name, - bld8.vec_type, intrargs, 2, 0); + color2_2 = lp_build_pavgb(&bld8, colors0, colors1); color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, ""); } else { @@ -1278,8 +1316,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm, /* same interleave as for lerp23 - correct result in 2nd element */ intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0); intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, ""); - color2_2 = lp_build_intrinsic(builder, "llvm.x86.sse2.pavg.b", - bld8.vec_type, intrargs, 2, 0); + color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]); } else { LLVMValueRef v01, v0, v1, vhalf; diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c index a8aa33d8ae9..885d886cfa9 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_format.c +++ b/src/gallium/drivers/llvmpipe/lp_test_format.c @@ -44,8 +44,6 @@ #include "lp_test.h" -#define USE_TEXTURE_CACHE 1 - static struct lp_build_format_cache *cache_ptr; void @@ -80,7 +78,8 @@ typedef void static LLVMValueRef add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, const struct util_format_description *desc, -struct lp_
[Mesa-dev] [PATCH] gallivm: remove unused float coord wrapping for aos sampling
From: Roland Scheidegger AoS sampling tries to use integers for coord wrapping when possible, as it should be faster. However, for AVX, this was suboptimal, because only floats can use 8x32bit vectors, whereas integers have to be split into 4x32bit vectors. (I believe part of why it was slower was also that at least earlier llvm versions had trouble optimizing it properly, since you can still do simple bit ops with 8x32bit vectors, so a sequence of int add / and / int add / and with such vectors would actually end up doing 128bit inserts/extracts between the operations instead of just doing the cheap 128bit ands.) Hence, a special float coord wrapping path was added to AoS sampling. But this path was actually disabled for a long time already, since we found that just splitting everything before entering the AoS path was still slightly faster usually, so none of this float coord wrapping code was used anymore (AoS sampling code, when avx2 isn't supported, never sees vectors with length > 4). I thought it might be useful some day again, but I'm not interested anymore in optimizing for very weird instruction sets which have support for 256bit vectors for floats but not for ints, so just drop it. --- .../auxiliary/gallivm/lp_bld_sample_aos.c | 530 +- 1 file changed, 23 insertions(+), 507 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index c46749dbac8..ad3a9e4a4ca 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -131,68 +131,6 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, } -/** - * Build LLVM code for texture coord wrapping, for nearest filtering, - * for float texcoords. 
- * \param coord the incoming texcoord (s,t or r) - * \param length the texture size along one dimension - * \param offset the texel offset along the coord axis - * \param is_pot if TRUE, length is a power of two - * \param wrap_mode one of PIPE_TEX_WRAP_x - * \param icoord the texcoord after wrapping, as int - */ -static void -lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld, - LLVMValueRef coord, - LLVMValueRef length, - LLVMValueRef offset, - boolean is_pot, - unsigned wrap_mode, - LLVMValueRef *icoord) -{ - struct lp_build_context *coord_bld = &bld->coord_bld; - LLVMValueRef length_minus_one; - - switch(wrap_mode) { - case PIPE_TEX_WRAP_REPEAT: - if (offset) { - /* this is definitely not ideal for POT case */ - offset = lp_build_int_to_float(coord_bld, offset); - offset = lp_build_div(coord_bld, offset, length); - coord = lp_build_add(coord_bld, coord, offset); - } - /* take fraction, unnormalize */ - coord = lp_build_fract_safe(coord_bld, coord); - coord = lp_build_mul(coord_bld, coord, length); - *icoord = lp_build_itrunc(coord_bld, coord); - break; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); - if (bld->static_sampler_state->normalized_coords) { - /* scale coord to length */ - coord = lp_build_mul(coord_bld, coord, length); - } - if (offset) { - offset = lp_build_int_to_float(coord_bld, offset); - coord = lp_build_add(coord_bld, coord, offset); - } - coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, - length_minus_one); - *icoord = lp_build_itrunc(coord_bld, coord); - break; - - case PIPE_TEX_WRAP_CLAMP: - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - case PIPE_TEX_WRAP_MIRROR_REPEAT: - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - default: - assert(0); - } -} - - /** * Helper to compute the first coord and the weight for * linear wrap repeat npot textures @@ -424,129 +362,6 @@ 
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, } -/** - * Build LLVM code for texture coord wrapping, for linear filtering, - * for float texcoords. - * \param block_length is the length of the pixel block along the - * coordinate axis - * \param coord the incoming texcoord (s,t or r) - * \param length the texture size along one dimension - * \param offset the texel offset along the coord axis - * \param is_pot if TRUE, length is a power of two - * \param wrap_mode one of PIPE_TEX_WRAP_x - * \param coord0 the first texcoord after wrapping, as int - * \param coord1 the second texcoord after wrapping, as int - * \param weight the filter weight as int (0-255) - * \param force_nearest if this coord actually uses nearest filtering - */ -
[Mesa-dev] [PATCH] draw: fix infinite loop in line stippling
From: Roland Scheidegger The calculated length of a line may be infinite, if the coords we get are bogus. This leads to an infinite loop in line stippling. To prevent this, test for this explicitly (although technically on at least x86 sse it would actually work without the explicit test, as long as we use the int-converted length value). While here also get rid of some always-true condition. Note this does not actually solve the root cause, which is that the coords we receive are bogus after clipping. This seems a difficult problem to solve. One issue is that due to float arithmetic, clip w may become 0 after clipping if the incoming geometry is "sufficiently degenerate", hence x/y/z ndc (and window) coords will be all inf (or nan). Even with w not quite 0, I believe it's possible we produce values which are actually outside the view volume. (Also, x=y=z=w=0 coords in clipspace would be not considered subject to clipping, and similarly result in all NaN coords.) We just hope for now other draw stages (and rasterizers) can handle those relatively safely (llvmpipe itself should be sort of robust against this, certainly conversion to fixed point will produce garbage, it might fail a couple assertions but should neither hang nor crash otherwise). 
--- .../auxiliary/draw/draw_pipe_stipple.c| 26 +++ 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index d30572cc61..386b7649e4 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -48,8 +48,8 @@ struct stipple_stage { struct draw_stage stage; float counter; - uint pattern; - uint factor; + ushort pattern; + ushort factor; bool smooth; }; @@ -110,7 +110,7 @@ emit_segment(struct draw_stage *stage, struct prim_header *header, static inline bool -stipple_test(int counter, ushort pattern, int factor) +stipple_test(int counter, ushort pattern, ushort factor) { int b = (counter / factor) & 0xf; return !!((1 << b) & pattern); @@ -136,6 +136,10 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) float length; int i; + int intlength; + + if (header->flags & DRAW_PIPE_RESET_STIPPLE) + stipple->counter = 0; if (stipple->smooth) { float dx = x1 - x0; @@ -147,21 +151,21 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) length = MAX2(dx, dy); } - if (header->flags & DRAW_PIPE_RESET_STIPPLE) - stipple->counter = 0; + if (util_is_inf_or_nan(length)) + intlength = 0; + else + intlength = ceilf(length); /* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table. 
*/ - for (i = 0; i < length; i++) { + for (i = 0; i < intlength; i++) { bool result = stipple_test((int)stipple->counter + i, - (ushort)stipple->pattern, stipple->factor); + stipple->pattern, stipple->factor); if (result != state) { /* changing from "off" to "on" or vice versa */ if (state) { -if (start != i) { - /* finishing an "on" segment */ - emit_segment(stage, header, start / length, i / length); -} +/* finishing an "on" segment */ +emit_segment(stage, header, start / length, i / length); } else { /* starting an "on" segment */ -- 2.17.1 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix improper clamping of vertex index when fetching gs inputs
From: Roland Scheidegger Because we only have one file_max for the (2d) gs input file, the value actually represents the max of attrib and vertex index (although I'm not entirely sure if we really want the max, since the max valid value of the vertex dimension can be easily deduced from the input primitive). Thus in cases where the number of inputs is higher than the number of vertices per prim, we did not properly clamp the vertex index, which would result in out-of-bound fetches, potentially causing segfaults (the segfaults seemed actually difficult to trigger, but valgrind certainly wasn't happy). This might have happened even if the shader did not actually try to fetch bogus vertices, if the fetching happened in non-active conditional clauses. To fix simply use the correct max vertex index value (derived from the input prim type) instead when clamping for this case. --- .../auxiliary/gallivm/lp_bld_tgsi_soa.c | 38 ++- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 83d7dbea9a..0db81b31ad 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -41,6 +41,7 @@ #include "util/u_debug.h" #include "util/u_math.h" #include "util/u_memory.h" +#include "util/u_prim.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_info.h" @@ -1059,7 +1060,8 @@ emit_mask_scatter(struct lp_build_tgsi_soa_context *bld, static LLVMValueRef get_indirect_index(struct lp_build_tgsi_soa_context *bld, unsigned reg_file, unsigned reg_index, - const struct tgsi_ind_register *indirect_reg) + const struct tgsi_ind_register *indirect_reg, + unsigned index_limit) { LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; struct lp_build_context *uint_bld = &bld->bld_base.uint_bld; @@ -1107,8 +1109,7 @@ get_indirect_index(struct lp_build_tgsi_soa_context *bld, */ if (reg_file != 
TGSI_FILE_CONSTANT) { max_index = lp_build_const_int_vec(bld->bld_base.base.gallivm, - uint_bld->type, - bld->bld_base.info->file_max[reg_file]); + uint_bld->type, index_limit); assert(!uint_bld->type.sign); index = lp_build_min(uint_bld, index, max_index); @@ -1224,7 +1225,8 @@ emit_fetch_constant( indirect_index = get_indirect_index(bld, reg->Register.File, reg->Register.Index, - &reg->Indirect); + &reg->Indirect, + bld->bld_base.info->file_max[reg->Register.File]); /* All fetches are from the same constant buffer, so * we need to propagate the size to a vector to do a @@ -1341,7 +1343,8 @@ emit_fetch_immediate( indirect_index = get_indirect_index(bld, reg->Register.File, reg->Register.Index, - &reg->Indirect); + &reg->Indirect, + bld->bld_base.info->file_max[reg->Register.File]); /* * Unlike for other reg classes, adding pixel offsets is unnecessary - * immediates are stored as full vectors (FIXME??? - might be better @@ -1414,7 +1417,8 @@ emit_fetch_input( indirect_index = get_indirect_index(bld, reg->Register.File, reg->Register.Index, - &reg->Indirect); + &reg->Indirect, + bld->bld_base.info->file_max[reg->Register.File]); index_vec = get_soa_array_offsets(&bld_base->uint_bld, indirect_index, @@ -1502,7 +1506,15 @@ emit_fetch_gs_input( attrib_index = get_indirect_index(bld, reg->Register.File, reg->Register.Index, -&reg->Indirect); +&reg->Indirect, + /* +* XXX: this is possibly not quite the right value, since file_max may be +* larger than the max attrib index, due to it being the max of declared +* inputs AND the max vertices per prim (which is 6 for tri adj). +* It should however be safe to use (since we always allocate +* PIPE_MAX_SHADER_INPUTS (80) for it, which is overallocated quite a bit). +*/ +info->file_max[reg->R
[Mesa-dev] [PATCH] gallivm: don't use saturated unsigned add/sub intrinsics for llvm 8.0
From: Roland Scheidegger These have been removed. Unfortunately auto-upgrade doesn't work for jit. (Worse, it seems we don't get a compilation error anymore when compiling the shader, rather llvm will just do a call to a null function in the jitted shaders making it difficult to detect when intrinsics vanish.) Luckily the signed ones are still there, I helped convince llvm that removing them is a bad idea for now, since while the unsigned ones have sort of agreed-upon simplest patterns to replace them with, this is not the case for the signed ones, and they require _significantly_ more complex patterns - to the point that the recognition is IMHO probably unlikely to ever work reliably in practice (due to other optimizations interfering). (Even for the relatively trivial unsigned patterns, llvm already added test cases where recognition doesn't work, unsaturated add followed by saturated add may produce atrocious code.) Nevertheless, it seems there's a serious quest to squash all cpu-specific intrinsics going on, so I'd expect patches to nuke them as well to resurface. Adapt the existing fallback code to match the simple patterns llvm uses and hope for the best. I've verified with lp_test_blend that it does produce the expected saturated assembly instructions. Our cmp/select build helpers don't use boolean masks, but that doesn't seem to interfere with llvm's ability to recognize the pattern. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106231 --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 87 ++--- 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index e922474ef61..f348833206b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -557,23 +557,27 @@ lp_build_add(struct lp_build_context *bld, if (!type.floating && !type.fixed) { if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { - if (type.width == 8) -intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; - if (type.width == 16) -intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL; } else if (util_cpu_caps.has_altivec) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; } } if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { - if (type.width == 8) -intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; - if (type.width == 16) -intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL; + if (type.width == 16) + intrinsic = type.sign ? 
"llvm.x86.avx2.padds.w" : + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL; } } } @@ -592,8 +596,6 @@ lp_build_add(struct lp_build_context *bld, LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min); - } else { - a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED); } } @@ -612,6 +614,24 @@ lp_build_add(struct lp_build_context *bld, if(bld->type.norm && (bld->type.floating || bld->type.fixed)) res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED); + if (type.norm && !type.floating && !type.
[Mesa-dev] [PATCH] util: return 0 for NaNs in float_to_ubyte
From: Roland Scheidegger d3d10 requires NaNs to get converted to 0 for float->unorm conversions (and float->int etc.). GL spec probably doesn't care in general, but it would make sense to have reasonable behavior in any case imho - the old code was converting negative NaNs to 0, and positive NaNs to 255. (Note that using float comparison isn't actually all that much more effort in any case, at least with sse2 it's just float comparison (ucommiss) instead of int one - I converted the second comparison to float too simply because it saves the probably somewhat expensive transfer of the float from simd to int domain (with sse2 via stack), so the generated code actually has 2 fewer instructions, although float comparisons are more expensive than int ones.) --- src/gallium/auxiliary/util/u_math.h | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index 79869a1..712305c 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -360,7 +360,6 @@ uif(uint32_t ui) /** * Convert ubyte to float in [0, 1]. - * XXX a 256-entry lookup table would be slightly faster. */ static inline float ubyte_to_float(ubyte ub) @@ -375,16 +374,16 @@ ubyte_to_float(ubyte ub) static inline ubyte float_to_ubyte(float f) { - union fi tmp; - - tmp.f = f; - if (tmp.i < 0) { + /* return 0 for NaN too */ + if (!(f > 0.0f)) { return (ubyte) 0; } - else if (tmp.i >= 0x3f800000 /* 1.0f */) { + else if (f >= 1.0f) { return (ubyte) 255; } else { + union fi tmp; + tmp.f = f; tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f; return (ubyte) tmp.i; } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] draw: force draw pipeline if there's more than 65535 vertices
From: Roland Scheidegger The pt emit path can only handle 65535 - the number of vertices is truncated to a ushort, resulting in a too small buffer allocation, which will crash. Forcing the pipeline path looks suboptimal, then again this bug is probably there ever since GS is supported, so it seems it's not happening often. (Note that the vertex_id in the vertex header is 16 bit too, however this is only used by the draw pipeline, and it denotes the emit vertex nr, and that uses vbuf code, which will only emit smaller chunks, so should be fine I think.) Other solutions would be to simply allow 32bit counts for vertex allocation, however 65535 is already larger than this was intended for (the idea being it should be more cache friendly). Or could try to teach the pt emit path to split the emit in smaller chunks (only the non-index path can be affected, since gs output is always linear), but it's a bit tricky (we don't know the primitive boundaries up-front). Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=107295 Cc: --- src/gallium/auxiliary/draw/draw_pt_emit.c | 2 ++ src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c | 10 ++ src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c | 9 + 3 files changed, 21 insertions(+) diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index 6fb630b549..984c76fdf9 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -158,6 +158,7 @@ draw_pt_emit(struct pt_emit *emit, */ render->set_primitive(draw->render, prim_info->prim); + assert(vertex_count <= 65535); render->allocate_vertices(render, (ushort)translate->key.output_stride, (ushort)vertex_count); @@ -229,6 +230,7 @@ draw_pt_emit_linear(struct pt_emit *emit, */ render->set_primitive(draw->render, prim_info->prim); + assert(count <= 65535); if (!render->allocate_vertices(render, (ushort)translate->key.output_stride, (ushort)count)) diff --git 
a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index aa20b918f5..f76e022994 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -299,6 +299,16 @@ fetch_pipeline_generic(struct draw_pt_middle_end *middle, FREE(vert_info->verts); vert_info = &gs_vert_info; prim_info = &gs_prim_info; + + /* + * pt emit can only handle ushort number of vertices (see + * render->allocate_vertices). + * vsplit guarantees there's never more than 4096, however GS can + * easily blow this up (by a factor of 256 (or even 1024) max). + */ + if (vert_info->count > 65535) { + opt |= PT_PIPELINE; + } } else { if (draw_prim_assembler_is_required(draw, prim_info, vert_info)) { draw_prim_assembler_run(draw, prim_info, vert_info, diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 5e0c562256..91c9360cce 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -428,6 +428,15 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle, FREE(vert_info->verts); vert_info = &gs_vert_info; prim_info = &gs_prim_info; + /* + * pt emit can only handle ushort number of vertices (see + * render->allocate_vertices). + * vsplit guarantees there's never more than 4096, however GS can + * easily blow this up (by a factor of 256 (or even 1024) max). + */ + if (vert_info->count > 65535) { + opt |= PT_PIPELINE; + } } else { if (draw_prim_assembler_is_required(draw, prim_info, vert_info)) { draw_prim_assembler_run(draw, prim_info, vert_info, -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nir: fix msvc build
From: Roland Scheidegger Empty initializer braces aren't valid c (it's a gnu extension, and it's valid in c++). Hopefully fixes appveyor / msvc build... Fixes a3150c1d06ae7766c3d3fe3b33432e55c3c7527e --- src/compiler/nir/nir_format_convert.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/nir/nir_format_convert.h b/src/compiler/nir/nir_format_convert.h index 33d90f260c..45532b7488 100644 --- a/src/compiler/nir/nir_format_convert.h +++ b/src/compiler/nir/nir_format_convert.h @@ -121,7 +121,7 @@ nir_format_bitcast_uint_vec_unmasked(nir_builder *b, nir_ssa_def *src, DIV_ROUND_UP(src->num_components * src_bits, dst_bits); assert(dst_components <= 4); - nir_ssa_def *dst_chan[4] = { }; + nir_ssa_def *dst_chan[4] = {0}; if (dst_bits > src_bits) { unsigned shift = 0; unsigned dst_idx = 0; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600/sb: fix crash in fold_alu_op3
From: Roland Scheidegger fold_assoc() called from fold_alu_op3() can lower the number of src to 2, which then leads to an invalid access to n.src[2]->gvalue(). This didn't seem to have caused much harm in the past, but on Fedora 28 it will crash (presumably because -D_GLIBCXX_ASSERTIONS is used, although with libstdc++ 4.8.5 this didn't do anything, -D_GLIBCXX_DEBUG was needed to show the issue). An alternative fix would be to instead call fold_alu_op2() from within fold_assoc() when the number of src is reduced and return always TRUE from fold_assoc() in this case, with the only actual difference being the return value from fold_alu_op3() then. I'm not sure what the return value actually should be in this case (or whether it even can make a difference). https://bugs.freedesktop.org/show_bug.cgi?id=106928 Cc: mesa-sta...@lists.freedesktop.org --- src/gallium/drivers/r600/sb/sb_expr.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 1df78da660..ad798453bc 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -945,6 +945,8 @@ bool expr_handler::fold_alu_op3(alu_node& n) { if (!sh.safe_math && (n.bc.op_ptr->flags & AF_M_ASSOC)) { if (fold_assoc(&n)) return true; + if (n.src.size() < 3) + return fold_alu_op2(n); } value* v0 = n.src[0]->gvalue(); -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] nir/linker: fix msvc build
From: Roland Scheidegger Empty initializer braces aren't valid c (it's a gnu extension, and it's valid in c++). Hopefully fixes appveyor / msvc build... Fixes 6677e131b806b10754adcb7cf3f427a7fcc2aa09 --- src/compiler/glsl/gl_nir_link_atomics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/glsl/gl_nir_link_atomics.c b/src/compiler/glsl/gl_nir_link_atomics.c index da6f5107c9..887ac1b9d0 100644 --- a/src/compiler/glsl/gl_nir_link_atomics.c +++ b/src/compiler/glsl/gl_nir_link_atomics.c @@ -175,7 +175,7 @@ gl_nir_link_assign_atomic_counter_resources(struct gl_context *ctx, struct gl_shader_program *prog) { unsigned num_buffers; - unsigned num_atomic_buffers[MESA_SHADER_STAGES] = { }; + unsigned num_atomic_buffers[MESA_SHADER_STAGES] = {0}; struct active_atomic_buffer *abs = find_active_atomic_counters(ctx, prog, &num_buffers); -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600: fix copy/paste bug for sampleMaskIn workaround
From: Roland Scheidegger The sampleMaskIn workaround (b936f4d1ca0d2ab1e828ff6a6e617f12469687fa) tries to figure out if the shader is running at per-sample frequency, but there's a typo bug so it will only recognize per-sample linear inputs, not per-sample perspective ones. Spotted by Eric Engestrom --- src/gallium/drivers/r600/r600_shader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index c9f2fa6485..c466a48262 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1247,7 +1247,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off tgsi_parse_free(&parse); if (ctx->info.reads_samplemask && - (ctx->info.uses_linear_sample || ctx->info.uses_linear_sample)) { + (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) { inputs[1].enabled = true; } -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] llvmpipe: improve rasterization discard logic
From: Roland Scheidegger This unifies the explicit rasterization discard as well as the implicit rasterization disabled logic (which we need for another state tracker), which really should do the exact same thing. We'll now toss out the prims early on in setup with (implicit or explicit) discard, rather than do setup and binning with them, which was entirely pointless. (We should eventually get rid of implicit discard, which should also enable us to discard stuff already in draw, hence draw would be able to skip the pointless clip and fallback stages in this case.) We still need separate logic for only null ps - this is not the same as rasterization discard. But simplify the logic there and don't count primitives simply when there's an empty fs, regardless of depth/stencil tests, which seems perfectly acceptable by d3d10. While here, also fix statistics for primitives if face culling is enabled. No piglit changes. --- src/gallium/drivers/llvmpipe/lp_context.h | 1 - src/gallium/drivers/llvmpipe/lp_jit.c | 1 + src/gallium/drivers/llvmpipe/lp_jit.h | 5 +++ src/gallium/drivers/llvmpipe/lp_rast.c | 12 +++- src/gallium/drivers/llvmpipe/lp_rast_priv.h | 6 src/gallium/drivers/llvmpipe/lp_scene.c | 5 ++- src/gallium/drivers/llvmpipe/lp_scene.h | 10 +++--- src/gallium/drivers/llvmpipe/lp_setup.c | 18 ++- src/gallium/drivers/llvmpipe/lp_setup_line.c| 27 ++-- src/gallium/drivers/llvmpipe/lp_setup_point.c | 21 + src/gallium/drivers/llvmpipe/lp_setup_tri.c | 29 - src/gallium/drivers/llvmpipe/lp_setup_vbuf.c| 2 +- src/gallium/drivers/llvmpipe/lp_state_derived.c | 22 ++--- src/gallium/drivers/llvmpipe/lp_state_fs.c | 41 - src/gallium/drivers/llvmpipe/lp_state_fs.h | 5 --- 15 files changed, 118 insertions(+), 87 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 54d98fd..7a2f253 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -136,7 +136,6 @@ struct 
llvmpipe_context { struct blitter_context *blitter; unsigned tex_timestamp; - boolean no_rast; /** List of all fragment shader variants */ struct lp_fs_variant_list_item fs_variants_list; diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index a2762f3..e2309f4 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -212,6 +212,7 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp) elem_types[LP_JIT_THREAD_DATA_CACHE] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0); elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc); + elem_types[LP_JIT_THREAD_DATA_INVOCATIONS] = LLVMInt64TypeInContext(lc); elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] = LLVMInt32TypeInContext(lc); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 9db26f2..312d1a1 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -192,6 +192,7 @@ struct lp_jit_thread_data { struct lp_build_format_cache *cache; uint64_t vis_counter; + uint64_t ps_invocations; /* * Non-interpolated rasterizer state passed through to the fragment shader. 
@@ -205,6 +206,7 @@ struct lp_jit_thread_data enum { LP_JIT_THREAD_DATA_CACHE = 0, LP_JIT_THREAD_DATA_COUNTER, + LP_JIT_THREAD_DATA_INVOCATIONS, LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, LP_JIT_THREAD_DATA_COUNT }; @@ -216,6 +218,9 @@ enum { #define lp_jit_thread_data_counter(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter") +#define lp_jit_thread_data_invocations(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_INVOCATIONS, "invocs") + #define lp_jit_thread_data_raster_state_viewport_index(_gallivm, _ptr) \ lp_build_struct_get(_gallivm, _ptr, \ LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, \ diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 939944a..9d4f9f8 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -107,7 +107,7 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, task->scene->fb.height - y * TILE_SIZE : TILE_SIZE; task->thread_data.vis_counter = 0; - task->ps_invocations = 0; + task->thread_data.ps_invocations = 0; for (i = 0; i < task->scene->fb.nr_cbufs; i++) { if (task->scene->fb.cbufs[i]) { @@ -446,10 +446,6 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, * allocated 4x4 blocks hence need to filter them out here. */ if ((x % TILE_S
[Mesa-dev] [PATCH] draw: get rid of special logic to not emit null tris
From: Roland Scheidegger I've confirmed after 77554d220d6d74b4d913dc37ea3a874e9dc550e4 we no longer need this to pass some tests from another api (as we no longer generate the bogus extra null tris in the first place). --- src/gallium/auxiliary/draw/draw_pipe_clip.c | 38 - 1 file changed, 38 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index 46118b6..2a9c944 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -210,30 +210,6 @@ static void interp(const struct clip_stage *clip, } /** - * Checks whether the specified triangle is empty and if it is returns - * true, otherwise returns false. - * Triangle is considered null/empty if its area is equal to zero. - */ -static inline boolean -is_tri_null(const struct clip_stage *clip, const struct prim_header *header) -{ - const unsigned pos_attr = clip->pos_attr; - float x1 = header->v[1]->data[pos_attr][0] - header->v[0]->data[pos_attr][0]; - float y1 = header->v[1]->data[pos_attr][1] - header->v[0]->data[pos_attr][1]; - float z1 = header->v[1]->data[pos_attr][2] - header->v[0]->data[pos_attr][2]; - - float x2 = header->v[2]->data[pos_attr][0] - header->v[0]->data[pos_attr][0]; - float y2 = header->v[2]->data[pos_attr][1] - header->v[0]->data[pos_attr][1]; - float z2 = header->v[2]->data[pos_attr][2] - header->v[0]->data[pos_attr][2]; - - float vx = y1 * z2 - z1 * y2; - float vy = x1 * z2 - z1 * x2; - float vz = x1 * y2 - y1 * x2; - - return (vx*vx + vy*vy + vz*vz) == 0.f; -} - -/** * Emit a post-clip polygon to the next pipeline stage. The polygon * will be convex and the provoking vertex will always be vertex[0]. 
*/ @@ -247,7 +223,6 @@ static void emit_poly(struct draw_stage *stage, struct prim_header header; unsigned i; ushort edge_first, edge_middle, edge_last; - boolean tri_emitted = FALSE; if (stage->draw->rasterizer->flatshade_first) { edge_first = DRAW_PIPE_EDGE_FLAG_0; @@ -269,7 +244,6 @@ static void emit_poly(struct draw_stage *stage, header.pad = 0; for (i = 2; i < n; i++, header.flags = edge_middle) { - boolean tri_null; /* order the triangle verts to respect the provoking vertex mode */ if (stage->draw->rasterizer->flatshade_first) { header.v[0] = inlist[0]; /* the provoking vertex */ @@ -282,18 +256,6 @@ static void emit_poly(struct draw_stage *stage, header.v[2] = inlist[0]; /* the provoking vertex */ } - tri_null = is_tri_null(clipper, &header); - /* - * If we ever generated a tri (regardless if it had area or not), - * skip all subsequent null tris. - * FIXME: I think this logic was hiding bugs elsewhere. It should - * be possible now to always emit all tris. - */ - if (tri_null && tri_emitted) { - continue; - } - tri_emitted = TRUE; - if (!edgeflags[i-1]) { header.flags &= ~edge_middle; } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: Use alloca_undef with array type instead of alloca_array
From: Roland Scheidegger Use a single allocation of array type instead of the old-style array allocation for the temp and immediate arrays. Probably only makes a difference if they aren't used indirectly (so, if we used them solely because there's too many temps or immediates). In this case the sroa and early-cse passes can sometimes do some optimizations which they otherwise cannot. (As a side note, for the temp reg array, we actually really should use one allocation per array id, not just one for everything.) Note that the instcombine pass would actually promote such allocations to single alloc of array type as well, but it's too late for some artificial shaders we've seen to help (we don't want to run instcombine at the beginning due to its cost, hence would need another sroa/cse pass after instcombine). sroa/early-cse help there because they can actually eliminate all of the huge shader, reducing it to a single const output (don't ask...). (Interestingly, instcombine also removes all the bitcasts we do on that allocation for single-value gathering, and in the end directly indexes into the single vector elements, which according to spec is only semi-valid, but this happens regardless. Another thing instcombine also does is use inbound GEPs, which is probably something we should do manually as well - for indirectly indexed reg files llvm may not be able to figure it out on its own, but we should be able to guarantee all pointers are always inbound. In any case, by the looks of it using single allocation with array type seems to be the right thing to do even for ordinary shaders.) 
--- src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 61 + 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index e411f90..83d7dbe 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -741,7 +741,8 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask, assert(lp_check_value(bld_store->type, val)); assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind); - assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val)); + assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val) || + LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(dst_ptr))) == LLVMArrayTypeKind); if (exec_mask) { LLVMValueRef res, dst; @@ -852,7 +853,14 @@ get_file_ptr(struct lp_build_tgsi_soa_context *bld, if (bld->indirect_files & (1 << file)) { LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan); - return LLVMBuildGEP(builder, var_of_array, &lindex, 1, ""); + if (LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(var_of_array))) == LLVMArrayTypeKind) { + LLVMValueRef gep[2]; + gep[0] = lp_build_const_int32(bld->bld_base.base.gallivm, 0); + gep[1] = lindex; + return LLVMBuildGEP(builder, var_of_array, gep, 2, ""); + } else { + return LLVMBuildGEP(builder, var_of_array, &lindex, 1, ""); + } } else { assert(index <= bld->bld_base.info->file_max[file]); @@ -1352,21 +1360,20 @@ emit_fetch_immediate( /* Gather values from the immediate register array */ res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2); } else { - LLVMValueRef lindex = lp_build_const_int32(gallivm, -reg->Register.Index * 4 + swizzle); - LLVMValueRef imms_ptr = LLVMBuildGEP(builder, -bld->imms_array, &lindex, 1, ""); + LLVMValueRef gep[2]; + gep[0] = lp_build_const_int32(gallivm, 0); + gep[1] = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle); + LLVMValueRef imms_ptr = 
LLVMBuildGEP(builder, + bld->imms_array, gep, 2, ""); res = LLVMBuildLoad(builder, imms_ptr, ""); if (tgsi_type_is_64bit(stype)) { -LLVMValueRef lindex1; LLVMValueRef imms_ptr2; LLVMValueRef res2; - -lindex1 = lp_build_const_int32(gallivm, - reg->Register.Index * 4 + swizzle + 1); +gep[1] = lp_build_const_int32(gallivm, + reg->Register.Index * 4 + swizzle + 1); imms_ptr2 = LLVMBuildGEP(builder, - bld->imms_array, &lindex1, 1, ""); + bld->imms_array, gep, 2, ""); res2 = LLVMBuildLoad(builder, imms_ptr2, ""); res = emit_fetch_64bit(bld_base, stype, res, res2); } @@ -2957,13 +2964,14 @@ void lp_emit_immediate_soa( unsigned index = bld->num_immediates; struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
[Mesa-dev] [PATCH] llvmpipe: Fix random number generation for unit tests
From: Roland Scheidegger We were never producing negative numbers for signed types. Also fix only producing half the valid range for uint32, and properly clamp signed values. Because this now also properly tests snorm with actually negative values, need to increase eps for such conversions. I believe these cannot actually be hit in ordinary operation (e.g. if a snorm texture is sampled and output to snorm RT, it will still go through snorm->float and float->snorm conversion), so don't bother to do anything to fix the bad accuracy (might be quite complex). Basically, the issue is for something like snorm16->snorm8 that in the end this will just use a 8 bit arithmetic right shift. But the math behind it says we should actually do a division by 32767 / 127, which is ~258, not 256. So the result can be one bit off (values have too large magnitude), and furthermore, the shift has incorrect rounding (always rounds down). For positive numbers, these errors have different direction, but for negative ones they have the same, hence for some values the error will be 2 bit in the end. Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=106232 --- src/gallium/drivers/llvmpipe/lp_test_conv.c | 8 src/gallium/drivers/llvmpipe/lp_test_main.c | 13 +++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c index 6e58a03..a4f313a 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_conv.c +++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c @@ -211,6 +211,14 @@ test_one(unsigned verbose, assert(src_type.length * num_srcs == dst_type.length * num_dsts); eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type)); + if (dst_type.norm && dst_type.sign && src_type.sign && !src_type.floating) { + /* + * This is quite inaccurate due to shift being used. + * I don't think it's possible to hit such conversions with + * llvmpipe though. 
+ */ + eps *= 2; + } context = LLVMContextCreate(); gallivm = gallivm_create("test_module", context); diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c index 518ca27..5ec0dd3 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_main.c +++ b/src/gallium/drivers/llvmpipe/lp_test_main.c @@ -147,6 +147,7 @@ write_elem(struct lp_type type, void *dst, unsigned index, double value) if(type.sign) { long long lvalue = (long long)value; lvalue = MIN2(lvalue, ((long long)1 << (type.width - 1)) - 1); + lvalue = MAX2(lvalue, -((long long)1 << (type.width - 1))); switch(type.width) { case 8: *((int8_t *)dst + index) = (int8_t)lvalue; @@ -200,16 +201,24 @@ random_elem(struct lp_type type, void *dst, unsigned index) } else { unsigned long long mask; -if (type.fixed) + if (type.fixed) mask = ((unsigned long long)1 << (type.width / 2)) - 1; else if (type.sign) mask = ((unsigned long long)1 << (type.width - 1)) - 1; else mask = ((unsigned long long)1 << type.width) - 1; value += (double)(mask & rand()); + if (!type.fixed && !type.sign && type.width == 32) { +/* + * rand only returns half the possible range + * XXX 64bit values... + */ +if(rand() & 1) + value += (double)0x8000; + } } } - if(!type.sign) + if(type.sign) if(rand() & 1) value = -value; write_elem(type, dst, index, value); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] draw: fix different sign logic when clipping
From: Roland Scheidegger The logic was flawed, since mul(x,y) will be <= 0 (exactly 0) when the sign is the same but both numbers are sufficiently small (if the product is smaller than 2^-128). This could apparently lead to emitting a sufficient amount of additional bogus vertices to overflow the allocated array for them, hitting an assertion (still safe with release builds since we just aborted clipping after the assertion in this case - I'm however unsure if this is now really no longer possible, so that code stays). Not sure if the additional vertices could cause other grief, I didn't see anything wrong even when hitting the assertion. Essentially, both +-0 are treated as positive (the vertex is considered to be inside the clip volume for this plane), so integrate the logic determining different sign into the branch there. --- src/gallium/auxiliary/draw/draw_pipe_clip.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index b7a1b5c..6af5c09 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -47,11 +47,6 @@ /** Set to 1 to enable printing of coords before/after clipping */ #define DEBUG_CLIP 0 - -#ifndef DIFFERENT_SIGNS -#define DIFFERENT_SIGNS(x, y) ((x) * (y) <= 0.0F && (x) - (y) != 0.0F) -#endif - #define MAX_CLIPPED_VERTICES ((2 * (6 + PIPE_MAX_CLIP_PLANES))+1) @@ -479,6 +474,7 @@ do_clip_tri(struct draw_stage *stage, for (i = 1; i <= n; i++) { struct vertex_header *vert = inlist[i]; boolean *edge = &inEdges[i]; + boolean different_sign; float dp = getclipdist(clipper, vert, plane_idx); @@ -491,9 +487,12 @@ do_clip_tri(struct draw_stage *stage, return; outEdges[outcount] = *edge_prev; outlist[outcount++] = vert_prev; +different_sign = dp < 0.0f; + } else { +different_sign = !(dp < 0.0f); } - if (DIFFERENT_SIGNS(dp, dp_prev)) { + if (different_sign) { struct vertex_header *new_vert; boolean 
*new_edge; @@ -511,7 +510,7 @@ do_clip_tri(struct draw_stage *stage, if (dp < 0.0f) { /* Going out of bounds. Avoid division by zero as we -* know dp != dp_prev from DIFFERENT_SIGNS, above. +* know dp != dp_prev from different_sign, above. */ float t = dp / (dp - dp_prev); interp( clipper, new_vert, t, vert, vert_prev, viewport_index ); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] draw: simplify clip null tri logic
From: Roland Scheidegger Simplifies the logic when to emit null tris (albeit the reasons why we have to do this remain unclear). This is strictly just logic simplification, the behavior doesn't change at all. --- src/gallium/auxiliary/draw/draw_pipe_clip.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index 4cfa54b..b7a1b5c 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -253,7 +253,7 @@ static void emit_poly(struct draw_stage *stage, unsigned i; ushort edge_first, edge_middle, edge_last; boolean last_tri_was_null = FALSE; - boolean tri_was_not_null = FALSE; + boolean tri_emitted = FALSE; if (stage->draw->rasterizer->flatshade_first) { edge_first = DRAW_PIPE_EDGE_FLAG_0; @@ -289,17 +289,16 @@ static void emit_poly(struct draw_stage *stage, } tri_null = is_tri_null(clipper, &header); - /* If we generated a triangle with an area, aka. non-null triangle, - * or if the previous triangle was also null then skip all subsequent - * null triangles */ - if ((tri_was_not_null && tri_null) || (last_tri_was_null && tri_null)) { - last_tri_was_null = tri_null; + /* + * If we ever generated a tri (regardless if it had area or not), + * skip all subsequent null tris. + * FIXME: it is unclear why we always have to emit at least one + * tri. Maybe this is hiding bugs elsewhere. + */ + if (tri_null && tri_emitted) { continue; } - last_tri_was_null = tri_null; - if (!tri_null) { - tri_was_not_null = TRUE; - } + tri_emitted = TRUE; if (!edgeflags[i-1]) { header.flags &= ~edge_middle; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/4] gallivm: dump bitcode before optimization
From: Roland Scheidegger If we dump the bitcode for off-line debug purposes, we really want the pre-optimized bitcode, otherwise it's useless in identifying problems with IR optimization (if you have a shader which takes an hour to do IR optimization, it's also nice you don't have to wait that hour...). Also, print out the function passes for opt which correspond to what was used for jit compilation (and also the opt level for codegen). Using opt/llc this way should then pretty much mimic what was done for jit. (When specifying something like -time-passes -debug-pass=[Structure|Arguments] (for either opt or llc) that also gives very useful information in which passes all the time was spent, and which passes are really run along with the order - llvm will add passes due to dependencies on its own, and of course -O2 for llc comes with a ~100 pass list.) --- src/gallium/auxiliary/gallivm/lp_bld_init.c | 35 + 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index d0afff1..41d828c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -142,6 +142,10 @@ create_pass_manager(struct gallivm_state *gallivm) * TODO: Evaluate passes some more - keeping in mind * both quality of generated code and compile times. */ + /* + * NOTE: if you change this, don't forget to change the output + * with GALLIVM_DEBUG_DUMP_BC in gallivm_compile_module. + */ LLVMAddScalarReplAggregatesPass(gallivm->passmgr); LLVMAddEarlyCSEPass(gallivm->passmgr); LLVMAddCFGSimplificationPass(gallivm->passmgr); @@ -151,7 +155,7 @@ create_pass_manager(struct gallivm_state *gallivm) * due to licm implying lcssa (since llvm 3.5), which can take forever. * Even for sane shaders, the cost of licm is rather high (and not just * due to lcssa, licm itself too), though mostly only in cases when it - * can actually move things, so having to disable it is a pity. 
+ * can actually move things, so having to disable it is a pity. * LLVMAddLICMPass(gallivm->passmgr); */ LLVMAddReassociatePass(gallivm->passmgr); @@ -597,6 +601,22 @@ gallivm_compile_module(struct gallivm_state *gallivm) gallivm->builder = NULL; } + /* Dump bitcode to a file */ + if (gallivm_debug & GALLIVM_DEBUG_DUMP_BC) { + char filename[256]; + assert(gallivm->module_name); + util_snprintf(filename, sizeof(filename), "ir_%s.bc", gallivm->module_name); + LLVMWriteBitcodeToFile(gallivm->module, filename); + debug_printf("%s written\n", filename); + debug_printf("Invoke as \"opt %s %s | llc -O%d %s%s\"\n", + gallivm_debug & GALLIVM_DEBUG_NO_OPT ? "-mem2reg" : + "-sroa -early-cse -simplifycfg -reassociate " + "-mem2reg -constprop -instcombine -gvn", + filename, gallivm_debug & GALLIVM_DEBUG_NO_OPT ? 0 : 2, + (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "", + "[-mattr=<-mattr option(s)>]"); + } + if (gallivm_debug & GALLIVM_DEBUG_PERF) time_begin = os_time_get(); @@ -630,19 +650,6 @@ gallivm_compile_module(struct gallivm_state *gallivm) gallivm->module_name, time_msec); } - /* Dump byte code to a file */ - if (gallivm_debug & GALLIVM_DEBUG_DUMP_BC) { - char filename[256]; - assert(gallivm->module_name); - util_snprintf(filename, sizeof(filename), "ir_%s.bc", gallivm->module_name); - LLVMWriteBitcodeToFile(gallivm->module, filename); - debug_printf("%s written\n", filename); - debug_printf("Invoke as \"llc %s%s -o - %s\"\n", - (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "", - "[-mattr=<-mattr option(s)>]", - filename); - } - if (use_mcjit) { /* Setting the module's DataLayout to an empty string will cause the * ExecutionEngine to copy to the DataLayout string from its target -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/4] gallivm: (trivial) do division by 1000 with int64
From: Roland Scheidegger Conversion to int can otherwise overflow if compile times are over ~71min. (Yes this can happen...) --- src/gallium/auxiliary/gallivm/lp_bld_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index abca624..d0afff1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -624,7 +624,7 @@ gallivm_compile_module(struct gallivm_state *gallivm) if (gallivm_debug & GALLIVM_DEBUG_PERF) { int64_t time_end = os_time_get(); - int time_msec = (int)(time_end - time_begin) / 1000; + int time_msec = (int)((time_end - time_begin) / 1000); assert(gallivm->module_name); debug_printf("optimizing module %s took %d msec\n", gallivm->module_name, time_msec); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/4] gallivm: remove LICM pass
From: Roland Scheidegger LICM is simply too expensive, even though it presumably can help quite a bit in some cases. It was definitely cheaper in llvm 3.3, though as far as I can tell with llvm 3.3 it failed to do anything in most cases. early-cse also actually seems to cause licm to be able to move things when it previously couldn't, which causes noticeable compile time increases. There's more loop passes in llvm, but I'm not sure which ones are helpful, and I couldn't find anything which would roughly do what the old licm in llvm 3.3 did, so ditch it. --- src/gallium/auxiliary/gallivm/lp_bld_init.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 05a74a0..abca624 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -145,7 +145,15 @@ create_pass_manager(struct gallivm_state *gallivm) LLVMAddScalarReplAggregatesPass(gallivm->passmgr); LLVMAddEarlyCSEPass(gallivm->passmgr); LLVMAddCFGSimplificationPass(gallivm->passmgr); - LLVMAddLICMPass(gallivm->passmgr); + /* + * FIXME: LICM is potentially quite useful. However, for some + * rather crazy shaders the compile time can reach _hours_ per shader, + * due to licm implying lcssa (since llvm 3.5), which can take forever. + * Even for sane shaders, the cost of licm is rather high (and not just + * due to lcssa, licm itself too), though mostly only in cases when it + * can actually move things, so having to disable it is a pity. + * LLVMAddLICMPass(gallivm->passmgr); + */ LLVMAddReassociatePass(gallivm->passmgr); LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr); LLVMAddConstantPropagationPass(gallivm->passmgr); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] gallivm: add early cse pass
From: Roland Scheidegger This pass is quite cheap, and can simplify the IR quite a bit for our generated IR. In particular on a variety of shaders I've found the time saved by other passes due to the simplified IR more than makes up for the cost of this pass, and on top of that the end result is actually better. The only downside I've found is this enables the LICM pass to move some things out of the main shader loop (in the case I've seen, instanced vertex fetch (which is constant within the jit shader) plus the derived instructions in the shader) which it couldn't do before for some reason. This would actually be desirable but can increase compile time considerably (licm seems to have considerable cost when it actually can move things out of loops, due to alias analysis). But blaming early cse for this seems inappropriate. (Note that the first two sroa / earlycse passes are similar to what a standard llvm opt -O1/-O2 pipeline would do, albeit this has some more passes even before but I don't think they'd do much for us.) It also in particular helps some crazy shader used for driver verification (don't ask...) a lot (about factor of 6 faster in compile time) (due to simplifying the ir before LICM is run). While here, also move licm behind simplifycfg. For some shaders there seems to be very significant compile time gains (we've seen a factor of 1 albeit that was a really crazy shader you'd certainly never see in a real app), because LICM is quite expensive and there's cases where running simplifycfg (along with sroa and early-cse) before licm reduces IR complexity significantly. (I'm not entirely sure if it would make sense to also run it afterwards.) 
--- src/gallium/auxiliary/gallivm/lp_bld_init.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index ea5489b..05a74a0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -138,13 +138,14 @@ create_pass_manager(struct gallivm_state *gallivm) } if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) { - /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, - * but there are more on SVN. - * TODO: Add more passes. + /* + * TODO: Evaluate passes some more - keeping in mind + * both quality of generated code and compile times. */ LLVMAddScalarReplAggregatesPass(gallivm->passmgr); - LLVMAddLICMPass(gallivm->passmgr); + LLVMAddEarlyCSEPass(gallivm->passmgr); LLVMAddCFGSimplificationPass(gallivm->passmgr); + LLVMAddLICMPass(gallivm->passmgr); LLVMAddReassociatePass(gallivm->passmgr); LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr); LLVMAddConstantPropagationPass(gallivm->passmgr); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600: fix abs for op3 sources
From: Roland Scheidegger If a src was referencing the same temp as the dst, the per-component copy code didn't work. e.g. cndge r0.xy, r0.xx, |r2|, r3 got expanded into mov r12.x, |r2| cndge r0.x, r0.x, r12, r3 mov r12.y, |r2| cndge r0.y, r0.x, r12, r3 hence for the second cndge r0.x was mistakenly the previous cndge result. Fix this by doing all the movs first, so there's no bogus alu.last in between. Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=102905 --- src/gallium/drivers/r600/r600_shader.c | 110 + 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 6b5c42f86d..bd511c76ac 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -7076,33 +7076,42 @@ static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instru } static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx, - unsigned temp, int chan, + unsigned writemask, struct r600_bytecode_alu_src *bc_src, const struct r600_shader_src *shader_src) { struct r600_bytecode_alu alu; - int r; + int i, r; + int lasti = tgsi_last_instruction(writemask); + int temp_reg = 0; - r600_bytecode_src(bc_src, shader_src, chan); + r600_bytecode_src(&bc_src[0], shader_src, 0); + r600_bytecode_src(&bc_src[1], shader_src, 1); + r600_bytecode_src(&bc_src[2], shader_src, 2); + r600_bytecode_src(&bc_src[3], shader_src, 3); - /* op3 operands don't support abs modifier */ if (bc_src->abs) { - assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */ - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - alu.dst.sel = temp; - alu.dst.chan = chan; - alu.dst.write = 1; + temp_reg = r600_get_temp(ctx); - alu.src[0] = *bc_src; - alu.last = true; // sufficient? 
- r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; - - memset(bc_src, 0, sizeof(*bc_src)); - bc_src->sel = temp; - bc_src->chan = chan; + for (i = 0; i < lasti + 1; i++) { + if (!(writemask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.sel = temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.src[0] = bc_src[i]; + if (i == lasti) { + alu.last = 1; + } + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + memset(&bc_src[i], 0, sizeof(*bc_src)); + bc_src[i].sel = temp_reg; + bc_src[i].chan = i; + } } return 0; } @@ -7111,9 +7120,9 @@ static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; + struct r600_bytecode_alu_src srcs[4][4]; int i, j, r; int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); - int temp_regs[4]; unsigned op = ctx->inst_info->op; if (op == ALU_OP3_MULADD_IEEE && @@ -7121,10 +7130,12 @@ static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst) op = ALU_OP3_MULADD; for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { - temp_regs[j] = 0; - if (ctx->src[j].abs) - temp_regs[j] = r600_get_temp(ctx); + r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask, + srcs[j], &ctx->src[j]); + if (r) + return r; } + for (i = 0; i < lasti + 1; i++) { if (!(inst->Dst[0].Register.WriteMask & (1 << i))) continue; @@ -7132,9 +7143,7 @@ static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst) memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = op; for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { - r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]); - if (r) - return r; + alu.src[j] = srcs[j][i]
[Mesa-dev] [PATCH] u_blit: (trivial) u_blit.h needs to include p_defines.h
From: Roland Scheidegger (For the pipe_tex_filter enum) --- src/gallium/auxiliary/util/u_blit.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h index 085ea63..004ceae 100644 --- a/src/gallium/auxiliary/util/u_blit.h +++ b/src/gallium/auxiliary/util/u_blit.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" +#include "pipe/p_defines.h" #ifdef __cplusplus -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] draw: fix alpha value for very short aa lines
From: Roland Scheidegger The logic would not work correctly for line lengths smaller than 1.0, even a degenerated line with length 0 would still produce a fragment with alpha anywhere between 0.0 and 0.5. --- src/gallium/auxiliary/draw/draw_pipe_aaline.c | 25 - src/gallium/auxiliary/draw/draw_pipe_stipple.c | 1 - 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index 14a4b2f..66a943a 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -370,7 +370,30 @@ aaline_line(struct draw_stage *stage, struct prim_header *header) float t_l, t_w; uint i; - half_length = 0.5f * sqrtf(dx * dx + dy * dy) + 0.5f; + half_length = 0.5f * sqrtf(dx * dx + dy * dy); + + if (half_length < 0.5f) { + /* + * The logic we use for "normal" sized segments is incorrect + * for very short segments (basically because we only have + * one value to interpolate, not a distance to each endpoint). + * Therefore, we calculate half_length differently, so that for + * original line length (near) 0, we get alpha 0 - otherwise + * max alpha would still be 0.5. This also prevents us from + * artifacts due to degenerated lines (the endpoints being + * identical, which would still receive anywhere from alpha + * 0-0.5 otherwise) (at least the pstipple stage may generate + * such lines due to float inaccuracies if line length is very + * close to a integer). + * Might not be fully accurate neither (because the "strength" of + * the line is going to be determined by how close to the pixel + * center those 1 or 2 fragments are) but it's probably the best + * we can do. 
+ */ + half_length = 2.0f * half_length; + } else { + half_length = half_length + 0.5f; + } t_w = half_width; t_l = 0.5f; diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index 3a44e96..d30572c 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -150,7 +150,6 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) if (header->flags & DRAW_PIPE_RESET_STIPPLE) stipple->counter = 0; - /* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table. */ for (i = 0; i < length; i++) { -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] draw: fix line stippling with aa lines
From: Roland Scheidegger In contrast to non-aa, where stippling is based on either dx or dy (depending on if it's a x or y major line), stippling is based on actual distance with smooth lines, so adjust for this. (It looks like there's some minor artifacts with mesa demos line-sample with wide lines, I think there might be some issues with wide lines and very short line segments (when the original line segment length is below half a pixel) but it may be related to aa lines rather than stippling.) --- src/gallium/auxiliary/draw/draw_pipe_stipple.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index 3a84d6c..8fa8274 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -50,6 +50,7 @@ struct stipple_stage { float counter; uint pattern; uint factor; + bool smooth; }; @@ -136,9 +137,15 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) float dx = x0 > x1 ? x0 - x1 : x1 - x0; float dy = y0 > y1 ? y0 - y1 : y1 - y0; - float length = MAX2(dx, dy); + float length; int i; + if (stipple->smooth) { + length = sqrtf(dx*dx + dy*dy); + } else { + length = MAX2(dx, dy); + } + if (header->flags & DRAW_PIPE_RESET_STIPPLE) stipple->counter = 0; @@ -205,6 +212,7 @@ stipple_first_line(struct draw_stage *stage, stipple->pattern = draw->rasterizer->line_stipple_pattern; stipple->factor = draw->rasterizer->line_stipple_factor + 1; + stipple->smooth = draw->rasterizer->line_smooth; stage->line = stipple_line; stage->line(stage, header); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] draw: simplify (and correct) aaline fallback (v2)
From: Roland Scheidegger The motivation actually was to get rid of the additional tex instruction, since that requires the draw fallback code to intercept all sampler / view calls (even if the fallback is never hit). Basically, the idea is to use coverage of the pixel to calculate the alpha value, and coverage is simply based on the distance to the center of the line (in both line direction, which is useful for wide lines, as well as perpendicular to the line). This is much closer to what hw supporting this natively actually does. It also fixes an issue with line width not quite being correct, as well as endpoints getting stretched too far (in line direction) with wide lines, which is apparent with mesa demo line-width. (For llvmpipe, it would probably make sense to do something like this directly when drawing lines, since rendering two tris is twice as expensive as a line, but it would need some changes with state management.) Since we're no longer relying on mipmapping to get the alpha value, we also don't need to draw 3 rects (6 tris), one is sufficient. There's still issues (as before): - quite sure it's not correct without half_pixel_center, but can't test this with GL. - aaline + line stipple is incorrect (evident with line-width demo). Looking at the spec the stipple pattern should actually be based on distance (not just dx or dy for x/y major lines as without aa). - outputs (other than pos + the one used for line aa) should be reinterpolated since we actually increase line length by half a pixel (but there's no tests which would care). 
v2: simplify the math (should be equivalent), don't need immediate --- src/gallium/auxiliary/draw/draw_pipe_aaline.c | 504 +- 1 file changed, 100 insertions(+), 404 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index a859dbc..591e2a3 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -1,6 +1,6 @@ /** * - * Copyright 2007 VMware, Inc. + * Copyright 2007-2018 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -26,7 +26,7 @@ **/ /** - * AA line stage: AA lines are converted to texture mapped triangles. + * AA line stage: AA lines are converted triangles (with extra generic) * * Authors: Brian Paul */ @@ -40,7 +40,6 @@ #include "util/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_sampler.h" #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_dump.h" @@ -55,19 +54,6 @@ /** - * Size for the alpha texture used for antialiasing - */ -#define TEXTURE_SIZE_LOG2 5 /* 32 x 32 */ - -/** - * Max texture level for the alpha texture used for antialiasing - * - * Don't use the 1x1 and 2x2 mipmap levels. - */ -#define MAX_TEXTURE_LEVEL (TEXTURE_SIZE_LOG2 - 2) - - -/** * Subclass of pipe_shader_state to carry extra fragment shader info. 
*/ struct aaline_fragment_shader @@ -75,8 +61,7 @@ struct aaline_fragment_shader struct pipe_shader_state state; void *driver_fs; void *aaline_fs; - uint sampler_unit; - int generic_attrib; /**< texcoord/generic used for texture */ + int generic_attrib; /**< generic used for distance */ }; @@ -89,26 +74,16 @@ struct aaline_stage float half_line_width; - /** For AA lines, this is the vertex attrib slot for the new texcoords */ - uint tex_slot; + /** For AA lines, this is the vertex attrib slot for new generic */ + uint coord_slot; /** position, not necessarily output zero */ uint pos_slot; - void *sampler_cso; - struct pipe_resource *texture; - struct pipe_sampler_view *sampler_view; - uint num_samplers; - uint num_sampler_views; - /* * Currently bound state */ struct aaline_fragment_shader *fs; - struct { - void *sampler[PIPE_MAX_SAMPLERS]; - struct pipe_sampler_view *sampler_views[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - } state; /* * Driver interface/override functions @@ -117,15 +92,6 @@ struct aaline_stage const struct pipe_shader_state *); void (*driver_bind_fs_state)(struct pipe_context *, void *); void (*driver_delete_fs_state)(struct pipe_context *, void *); - - void (*driver_bind_sampler_states)(struct pipe_context *, - enum pipe_shader_type, unsigned, - unsigned, void **); - - void (*driver_set_sampler_views)(struct pipe_context *, -enum pipe_shader_type shader, -unsigned start, unsigned count, -struct pipe_sampler_view **); }; @@ -136,41 +102,27 @@ struct aaline_sta
[Mesa-dev] [PATCH] draw: simplify (and correct) aaline fallback
From: Roland Scheidegger The motivation actually was to get rid of the additional tex instruction, since that requires the draw fallback code to intercept all sampler / view calls (even if the fallback is never hit). Basically, the idea is to use coverage of the pixel to calculate the alpha value, and coverage is simply based on the distance to the center of the line (in both line direction, which is useful for wide lines, as well as perpendicular to the line). This is much closer to what hw supporting this natively actually does. It also fixes an issue with line width not quite being correct, as well as endpoints getting stretched too far (in line direction) with wide lines, which is apparent with mesa demo line-width. (For llvmpipe, it would probably make sense to do something like this directly when drawing lines, since rendering two tris is twice as expensive as a line, but it would need some changes with state management.) Since we're no longer relying on mipmapping to get the alpha value, we also don't need to draw 3 rects (6 tris), one is sufficient. There's still issues (as before): - quite sure it's not correct without half_pixel_center, but can't test this with GL. - aaline + line stipple is incorrect (evident with line-width demo). Looking at the spec the stipple pattern should actually be based on distance (not just dx or dy for x/y major lines as without aa). - outputs (other than pos + the one used for line aa) should be reinterpolated since we actually increase line length by half a pixel (but there's no tests which would care). --- src/gallium/auxiliary/draw/draw_pipe_aaline.c | 532 +++--- 1 file changed, 131 insertions(+), 401 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index a859dbc..b490a50 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -1,6 +1,6 @@ /** * - * Copyright 2007 VMware, Inc. 
+ * Copyright 2007-2018 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -26,7 +26,7 @@ **/ /** - * AA line stage: AA lines are converted to texture mapped triangles. + * AA line stage: AA lines are converted triangles (with extra generic) * * Authors: Brian Paul */ @@ -40,7 +40,6 @@ #include "util/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_sampler.h" #include "tgsi/tgsi_transform.h" #include "tgsi/tgsi_dump.h" @@ -55,19 +54,6 @@ /** - * Size for the alpha texture used for antialiasing - */ -#define TEXTURE_SIZE_LOG2 5 /* 32 x 32 */ - -/** - * Max texture level for the alpha texture used for antialiasing - * - * Don't use the 1x1 and 2x2 mipmap levels. - */ -#define MAX_TEXTURE_LEVEL (TEXTURE_SIZE_LOG2 - 2) - - -/** * Subclass of pipe_shader_state to carry extra fragment shader info. */ struct aaline_fragment_shader @@ -75,8 +61,7 @@ struct aaline_fragment_shader struct pipe_shader_state state; void *driver_fs; void *aaline_fs; - uint sampler_unit; - int generic_attrib; /**< texcoord/generic used for texture */ + int generic_attrib; /**< generic used for distance */ }; @@ -89,26 +74,16 @@ struct aaline_stage float half_line_width; - /** For AA lines, this is the vertex attrib slot for the new texcoords */ - uint tex_slot; + /** For AA lines, this is the vertex attrib slot for new generic */ + uint coord_slot; /** position, not necessarily output zero */ uint pos_slot; - void *sampler_cso; - struct pipe_resource *texture; - struct pipe_sampler_view *sampler_view; - uint num_samplers; - uint num_sampler_views; - /* * Currently bound state */ struct aaline_fragment_shader *fs; - struct { - void *sampler[PIPE_MAX_SAMPLERS]; - struct pipe_sampler_view *sampler_views[PIPE_MAX_SHADER_SAMPLER_VIEWS]; - } state; /* * Driver interface/override functions @@ -117,15 +92,6 @@ struct aaline_stage const struct pipe_shader_state *); void (*driver_bind_fs_state)(struct 
pipe_context *, void *); void (*driver_delete_fs_state)(struct pipe_context *, void *); - - void (*driver_bind_sampler_states)(struct pipe_context *, - enum pipe_shader_type, unsigned, - unsigned, void **); - - void (*driver_set_sampler_views)(struct pipe_context *, -enum pipe_shader_type shader, -unsigned start, unsigned count, -struct pipe_sampler_view **); }; @@ -136,41 +102,38 @@ struct aaline_stage */ struct aa_transform_context { struct tgsi_transform_con
[Mesa-dev] [PATCH] tgsi/scan: use wrap-around shift behavior explicitly for file_mask
From: Roland Scheidegger The comment said it will only represent the lowest 32 regs. This was not entirely true in practice, since at least on x86 you'll get masked shifts (unless the compiler could recognize it already and toss it out). It turns out this actually works out alright (presumably no one uses it for temp regs) when increasing max sampler views, so make that behavior explicit. Albeit it feels a bit hacky (but in any case, explicit behavior there is better than undefined behavior). --- src/gallium/auxiliary/tgsi/tgsi_scan.c | 7 +-- src/gallium/drivers/llvmpipe/lp_state_fs.c | 7 ++- src/gallium/drivers/swr/swr_shader.cpp | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index c35eff2..0d229c9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -585,8 +585,11 @@ scan_declaration(struct tgsi_shader_info *info, int buffer; unsigned index, target, type; - /* only first 32 regs will appear in this bitfield */ - info->file_mask[file] |= (1 << reg); + /* + * only first 32 regs will appear in this bitfield, if larger + * bits will wrap around. + */ + info->file_mask[file] |= (1 << (reg & 31)); info->file_count[file]++; info->file_max[file] = MAX2(info->file_max[file], (int)reg); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 603fd84..48c004c 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -3323,7 +3323,12 @@ make_variant_key(struct llvmpipe_context *lp, if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; for(i = 0; i < key->nr_sampler_views; ++i) { - if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + /* + * Note sview may exceed what's representable by file_mask. 
+ * This will still work, the only downside is that not actually + * used views may be included in the shader key. + */ + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << (i & 31))) { lp_sampler_static_texture_state(&key->state[i].texture_state, lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); } diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index e5fb679..fa1c0b8 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -98,7 +98,7 @@ swr_generate_sampler_key(const struct lp_tgsi_info &info, key.nr_sampler_views = info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; for (unsigned i = 0; i < key.nr_sampler_views; i++) { - if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << (i & 31))) { const struct pipe_sampler_view *view = ctx->sampler_views[shader_type][i]; lp_sampler_static_texture_state( -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] cso: don't cycle through PIPE_MAX_SHADER_SAMPLER_VIEWS on context destroy
From: Roland Scheidegger There's no point, we know the highest non-null one. --- src/gallium/auxiliary/cso_cache/cso_context.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index 1b5d4b5..3fa57f1 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -407,8 +407,10 @@ void cso_destroy_context( struct cso_context *ctx ) ctx->pipe->set_stream_output_targets(ctx->pipe, 0, NULL, NULL); } - for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + for (i = 0; i < ctx->nr_fragment_views; i++) { pipe_sampler_view_reference(&ctx->fragment_views[i], NULL); + } + for (i = 0; i < ctx->nr_fragment_views_saved; i++) { pipe_sampler_view_reference(&ctx->fragment_views_saved[i], NULL); } -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] softpipe: don't iterate through PIPE_MAX_SHADER_SAMPLER_VIEWS
From: Roland Scheidegger We were setting view to NULL if the iteration index was at or beyond num. But in fact if the view is NULL the code did nothing anyway... --- src/gallium/drivers/softpipe/sp_state_sampler.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c index c10fd91..751eb76 100644 --- a/src/gallium/drivers/softpipe/sp_state_sampler.c +++ b/src/gallium/drivers/softpipe/sp_state_sampler.c @@ -181,8 +181,8 @@ prepare_shader_sampling( if (!num) return; - for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { - struct pipe_sampler_view *view = i < num ? views[i] : NULL; + for (i = 0; i < num; i++) { + struct pipe_sampler_view *view = views[i]; if (view) { struct pipe_resource *tex = view->texture; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] RFC: gallium: increase PIPE_MAX_SHADER_SAMPLER_VIEWS to 128
From: Roland Scheidegger Some state trackers require 128. (There are no plans to increase PIPE_MAX_SAMPLERS too, since with gl state tracker it's unlikely more than 32 will be needed, if you need more use bindless.) --- src/gallium/include/pipe/p_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index 2b56d60..cddb3b4 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -64,7 +64,7 @@ extern "C" { #define PIPE_MAX_SAMPLERS 32 #define PIPE_MAX_SHADER_INPUTS80 /* 32 GENERIC + 32 PATCH + 16 others */ #define PIPE_MAX_SHADER_OUTPUTS 80 /* 32 GENERIC + 32 PATCH + 16 others */ -#define PIPE_MAX_SHADER_SAMPLER_VIEWS 32 +#define PIPE_MAX_SHADER_SAMPLER_VIEWS 128 #define PIPE_MAX_SHADER_BUFFERS 32 #define PIPE_MAX_SHADER_IMAGES32 #define PIPE_MAX_TEXTURE_LEVELS 16 -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] draw: don't needlessly iterate through all sampler view slots
From: Roland Scheidegger We already stored the highest (potentially) used number. --- src/gallium/auxiliary/draw/draw_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 9791ec5..e887272 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -973,7 +973,7 @@ draw_set_sampler_views(struct draw_context *draw, for (i = 0; i < num; ++i) draw->sampler_views[shader_stage][i] = views[i]; - for (i = num; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; ++i) + for (i = num; i < draw->num_sampler_views[shader_stage]; ++i) draw->sampler_views[shader_stage][i] = NULL; draw->num_sampler_views[shader_stage] = num; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] tgsi: Recognize RET in main for tgsi_transform
From: Roland Scheidegger Shaders coming from dx10 state trackers have a RET before the END. And the epilog needs to be placed before the RET (otherwise it will get ignored). Hence figure out if a RET is in main, in this case we'll place the epilog there rather than before the END. (At a closer look, there actually seem to be problems with control flow in general with output redirection, that would need another look. It's enough however to fix draw's aa line emulation in some internal bug - lines tend to be drawn with trivial shaders, moving either a constant color or a vertex color directly to the output). v2: add assert so buggy handling of RET in main is detected --- src/gallium/auxiliary/tgsi/tgsi_transform.c | 62 + 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c index ffdad13..a13cf90 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_transform.c +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c @@ -110,6 +110,9 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in, { uint procType; boolean first_instruction = TRUE; + boolean epilog_emitted = FALSE; + int cond_stack = 0; + int call_stack = 0; /* input shader */ struct tgsi_parse_context parse; @@ -166,22 +169,66 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in, { struct tgsi_full_instruction *fullinst = &parse.FullToken.FullInstruction; +unsigned opcode = fullinst->Instruction.Opcode; if (first_instruction && ctx->prolog) { ctx->prolog(ctx); } -/* XXX Note: we may also want to look for a main/top-level - * TGSI_OPCODE_RET instruction in the future. +/* + * XXX Note: we handle the case of ret in main. + * However, the output redirections done by transform + * have their limits with control flow and will generally + * not work correctly. e.g. 
+ * if (cond) { + *oColor = x; + *ret; + * } + * oColor = y; + * end; + * If the color output is redirected to a temp and modified + * by a transform, this will not work (the oColor assignment + * in the conditional will never make it to the actual output). */ -if (fullinst->Instruction.Opcode == TGSI_OPCODE_END -&& ctx->epilog) { - /* Emit caller's epilog */ - ctx->epilog(ctx); - /* Emit END */ +if ((opcode == TGSI_OPCODE_END || opcode == TGSI_OPCODE_RET) && + call_stack == 0 && ctx->epilog && !epilog_emitted) { + if (opcode == TGSI_OPCODE_RET && cond_stack != 0) { + assert(!"transform ignoring RET in main"); + } else { + assert(cond_stack == 0); + /* Emit caller's epilog */ + ctx->epilog(ctx); + epilog_emitted = TRUE; + } + /* Emit END (or RET) */ ctx->emit_instruction(ctx, fullinst); } else { + switch (opcode) { + case TGSI_OPCODE_IF: + case TGSI_OPCODE_UIF: + case TGSI_OPCODE_SWITCH: + case TGSI_OPCODE_BGNLOOP: + cond_stack++; + break; + case TGSI_OPCODE_CAL: + call_stack++; + break; + case TGSI_OPCODE_ENDIF: + case TGSI_OPCODE_ENDSWITCH: + case TGSI_OPCODE_ENDLOOP: + assert(cond_stack > 0); + cond_stack--; + break; + case TGSI_OPCODE_ENDSUB: + assert(call_stack > 0); + call_stack--; + break; + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_RET: + default: + break; + } if (ctx->transform_instruction) ctx->transform_instruction(ctx, fullinst); else @@ -231,6 +278,7 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in, assert( 0 ); } } + assert(call_stack == 0); tgsi_parse_free (&parse); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] tgsi: Recognize RET in main for tgsi_transform
From: Roland Scheidegger Shaders coming from dx10 state trackers have a RET before the END. And the epilog needs to be placed before the RET (otherwise it will get ignored). Hence figure out if a RET is in main, in this case we'll place the epilog there rather than before the END. (At a closer look, there actually seem to be problems with control flow in general with output redirection, that would need another look. It's enough however to fix draw's aa line emulation in some internal bug - lines tend to be drawn with trivial shaders, moving either a constant color or a vertex color directly to the output). --- src/gallium/auxiliary/tgsi/tgsi_transform.c | 50 ++--- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c index ffdad13..94d872c 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_transform.c +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c @@ -110,6 +110,8 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in, { uint procType; boolean first_instruction = TRUE; + boolean epilog_emitted = FALSE; + int stack_size = 0; /* input shader */ struct tgsi_parse_context parse; @@ -166,22 +168,60 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in, { struct tgsi_full_instruction *fullinst = &parse.FullToken.FullInstruction; +unsigned opcode = fullinst->Instruction.Opcode; if (first_instruction && ctx->prolog) { ctx->prolog(ctx); } -/* XXX Note: we may also want to look for a main/top-level - * TGSI_OPCODE_RET instruction in the future. +/* + * XXX Note: we handle the case of ret in main. + * However, the output redirections done by transform + * have their limits with control flow and will generally + * not work correctly. e.g. 
+ * if (cond) { + *oColor = x; + *ret; + * } + * oColor = y; + * end; + * If the color output is redirected to a temp and modified + * by a transform, this will not work (the oColor assignment + * in the conditional will never make it to the actual output). */ -if (fullinst->Instruction.Opcode == TGSI_OPCODE_END -&& ctx->epilog) { +if ((opcode == TGSI_OPCODE_END || + (opcode == TGSI_OPCODE_RET && stack_size == 0)) +&& ctx->epilog && !epilog_emitted) { /* Emit caller's epilog */ ctx->epilog(ctx); - /* Emit END */ + epilog_emitted = TRUE; + /* Emit END (or RET) */ + if (opcode == TGSI_OPCODE_END) { + assert(stack_size == 0); + } ctx->emit_instruction(ctx, fullinst); } else { + switch (opcode) { + case TGSI_OPCODE_IF: + case TGSI_OPCODE_UIF: + case TGSI_OPCODE_SWITCH: + case TGSI_OPCODE_BGNLOOP: + case TGSI_OPCODE_CAL: + stack_size++; + break; + case TGSI_OPCODE_ENDIF: + case TGSI_OPCODE_ENDSWITCH: + case TGSI_OPCODE_ENDLOOP: + case TGSI_OPCODE_ENDSUB: + assert(stack_size > 0); + stack_size--; + break; + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_RET: + default: + break; + } if (ctx->transform_instruction) ctx->transform_instruction(ctx, fullinst); else -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] u_blit, u_simple_shaders: add shader to convert from xrbias format
From: Roland Scheidegger We need this to handle some oddball dx10 format (DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM). What you can do with this format is very limited, hence we don't want to add it as a gallium format (we could not express the properties of this format as ordinary format properties either, so like all special formats it would need specific code for handling it in any case). While here, also nuke the array for different shaders for different writemasks, as it was not actually used (always full masks are passed in for generating shaders). --- src/gallium/auxiliary/util/u_blit.c | 40 +- src/gallium/auxiliary/util/u_blit.h | 3 +- src/gallium/auxiliary/util/u_simple_shaders.c | 48 +++ src/gallium/auxiliary/util/u_simple_shaders.h | 4 +++ 4 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index 3f92476..bf1dea7 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -65,7 +65,7 @@ struct blit_state struct pipe_vertex_element velem[2]; void *vs; - void *fs[PIPE_MAX_TEXTURE_TYPES][TGSI_WRITEMASK_XYZW + 1][3]; + void *fs[PIPE_MAX_TEXTURE_TYPES][4]; struct pipe_resource *vbuf; /**< quad vertices */ unsigned vbuf_slot; @@ -135,17 +135,15 @@ void util_destroy_blit(struct blit_state *ctx) { struct pipe_context *pipe = ctx->pipe; - unsigned i, j, k; + unsigned i, j; if (ctx->vs) pipe->delete_vs_state(pipe, ctx->vs); for (i = 0; i < ARRAY_SIZE(ctx->fs); i++) { for (j = 0; j < ARRAY_SIZE(ctx->fs[i]); j++) { - for (k = 0; k < ARRAY_SIZE(ctx->fs[i][j]); k++) { -if (ctx->fs[i][j][k]) - pipe->delete_fs_state(pipe, ctx->fs[i][j][k]); - } + if (ctx->fs[i][j]) +pipe->delete_fs_state(pipe, ctx->fs[i][j]); } } @@ -159,8 +157,9 @@ util_destroy_blit(struct blit_state *ctx) * Helper function to set the fragment shaders. 
*/ static inline void -set_fragment_shader(struct blit_state *ctx, uint writemask, +set_fragment_shader(struct blit_state *ctx, enum pipe_format format, +boolean src_xrbias, enum pipe_texture_target pipe_tex) { enum tgsi_return_type stype; @@ -177,19 +176,29 @@ set_fragment_shader(struct blit_state *ctx, uint writemask, idx = 2; } - if (!ctx->fs[pipe_tex][writemask][idx]) { + if (src_xrbias) { + assert(stype == TGSI_RETURN_TYPE_FLOAT); + idx = 3; + if (!ctx->fs[pipe_tex][idx]) { + unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex, 0); + ctx->fs[pipe_tex][idx] = +util_make_fragment_tex_shader_xrbias(ctx->pipe, tgsi_tex); + } + } + + else if (!ctx->fs[pipe_tex][idx]) { unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex, 0); /* OpenGL does not allow blits from signed to unsigned integer * or vice versa. */ - ctx->fs[pipe_tex][writemask][idx] = + ctx->fs[pipe_tex][idx] = util_make_fragment_tex_shader_writemask(ctx->pipe, tgsi_tex, TGSI_INTERPOLATE_LINEAR, - writemask, + TGSI_WRITEMASK_XYZW, stype, stype, false, false); } - cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][writemask][idx]); + cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][idx]); } @@ -491,8 +500,8 @@ util_blit_pixels(struct blit_state *ctx, * The sampler view's first_layer indicate the layer to use, but for * cube maps it must point to the first face. Face is passed in src_face. * - * The main advantage over util_blit_pixels is that it allows to specify swizzles in - * pipe_sampler_view::swizzle_?. + * The main advantage over util_blit_pixels is that it allows to specify + * swizzles in pipe_sampler_view::swizzle_?. * * But there is no control over blitting Z and/or stencil. 
*/ @@ -505,7 +514,8 @@ util_blit_pixels_tex(struct blit_state *ctx, struct pipe_surface *dst, int dstX0, int dstY0, int dstX1, int dstY1, - float z, uint filter) + float z, uint filter, + boolean src_xrbias) { boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT; struct pipe_framebuffer_state fb; @@ -593,7 +603,7 @@ util_blit_pixels_tex(struct blit_state *ctx, cso_set_sampler_views(ctx->cso, PIPE_SHADER_FRAGMENT, 1, &src_sampler_view); /* shaders */ - set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW, + set_fragment_shader(ctx, src_xrbias, src_sampler_view->format, src_sampler_view->texture->target
[Mesa-dev] [PATCH 1/2] u_simple_shaders: fix mask handling in util_make_fragment_tex_shader_writemask
From: Roland Scheidegger The writemask handling was busted, since writing defaults to output meant they got overwritten by the tex sampling anyway. Albeit the affected components were undefined, so maybe with some luck it still would have worked with some drivers - if not could as well kill it... (This would have affected u_blitter but not u_blit since the latter always used xyzw mask.) --- src/gallium/auxiliary/util/u_simple_shaders.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c index 9679545..a301c05 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -275,7 +275,7 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, if (writemask != TGSI_WRITEMASK_XYZW) { struct ureg_src imm = ureg_imm4f( ureg, 0, 0, 0, 1 ); - ureg_MOV( ureg, out, imm ); + ureg_MOV(ureg, temp, imm); } if (tex_target == TGSI_TEXTURE_BUFFER) -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/4] r600: partly fix sampleMaskIn value
From: Roland Scheidegger The hw gives us coverage for pixel, not for individual fragment shader invocations, in case execution isn't per pixel (note eg, unlike cm, actually cannot do "real" minSampleShading, it's either per-pixel or per-fragment, but it doesn't really make a difference here). Also, with msaa disabled, the hw still gives us a mask corresponding to the number of samples, where GL requires this to be 1. Fix this up by masking the sampleMaskIn bits with the bit corresponding to the sampleID, if we know this shader is always executed at per-sample granularity. (In case of a per-sample frequency shader and msaa disabled, the sampleID will always be 0, so this works just fine there.) Fixing this for the minSampleShading case will require a shader key (radeonsi uses the prolog part for this) (for eg, could get away with a single bit, cm would need either more bits depending on sample/invocation ratio, or read the bits from a uniform), unless we'd want to always use a sample mask uniform (which is probably not a good idea, as it would make the ordinary common msaa case slower for no good reason). This fixes some parts of piglit arb_sample_shading-samplemask (needs fixed test), in particular those which use a sampleID, while still failing others as expected. 
--- src/gallium/drivers/r600/r600_shader.c | 54 ++ 1 file changed, 54 insertions(+) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 1009411c62..8779f166aa 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1138,6 +1138,11 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off tgsi_parse_free(&parse); + if (ctx->info.reads_samplemask && + (ctx->info.uses_linear_sample || ctx->info.uses_linear_sample)) { + inputs[1].enabled = true; + } + if (ctx->bc->chip_class >= EVERGREEN) { int num_baryc = 0; /* assign gpr to each interpolator according to priority */ @@ -3503,8 +3508,57 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, r = eg_load_helper_invocation(&ctx); if (r) return r; + } + + /* +* XXX this relies on fixed_pt_position_gpr only being present when +* this shader should be executed per sample. Should be the case for now... +*/ + if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) { + /* +* Fix up sample mask. The hw always gives us coverage mask for +* the pixel. However, for per-sample shading, we need the +* coverage for the shader invocation only. +* Also, with disabled msaa, only the first bit should be set +* (luckily the same fixup works for both problems). +* For now, we can only do it if we know this shader is always +* executed per sample (due to usage of bits in the shader +* forcing per-sample execution). +* If the fb is not multisampled, we'd do unnecessary work but +* it should still be correct. +* It will however do nothing for sample shading according +* to MinSampleShading. 
+*/ + struct r600_bytecode_alu alu; + int tmp = r600_get_temp(&ctx); + assert(ctx.face_gpr != -1); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + + alu.op = ALU_OP2_LSHL_INT; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0x1; + alu.src[1].sel = ctx.fixed_pt_position_gpr; + alu.src[1].chan = 3; + alu.dst.sel = tmp; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_AND_INT; + alu.src[0].sel = tmp; + alu.src[1].sel = ctx.face_gpr; + alu.src[1].chan = 2; + alu.dst.sel = ctx.face_gpr; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; } + if (ctx.fragcoord_input >= 0) { if (ctx.bc->chip_class == CAYMAN) { for (j = 0 ; j < 4; j++) { -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/4] r600/cm: (trivial) code cleanup for emitting msaa state
From: Roland Scheidegger No functional change (compile tested only). --- src/gallium/drivers/r600/cayman_msaa.c | 14 ++ src/gallium/drivers/r600/evergreen_state.c | 10 ++ src/gallium/drivers/r600/r600_pipe_common.h | 6 ++ 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/r600/cayman_msaa.c b/src/gallium/drivers/r600/cayman_msaa.c index 6bc307a4bc..f97924ac22 100644 --- a/src/gallium/drivers/r600/cayman_msaa.c +++ b/src/gallium/drivers/r600/cayman_msaa.c @@ -141,7 +141,7 @@ void cayman_init_msaa(struct pipe_context *ctx) cayman_get_sample_position(ctx, 16, i, rctx->sample_locations_16x[i]); } -void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples) +static void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples) { switch (nr_samples) { default: @@ -202,9 +202,8 @@ void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples) } } -void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, -int ps_iter_samples, int overrast_samples, -unsigned sc_mode_cntl_1) +void cayman_emit_msaa_state(struct radeon_winsys_cs *cs, int nr_samples, + int ps_iter_samples, int overrast_samples) { int setup_samples = nr_samples > 1 ? nr_samples : overrast_samples > 1 ? overrast_samples : 0; @@ -216,6 +215,13 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, * endcaps. 
*/ unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1); + unsigned sc_mode_cntl_1 = + EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | + EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1); + + if (nr_samples > 1) { + cayman_emit_msaa_sample_locs(cs, nr_samples); + } if (setup_samples > 1) { /* indexed by log2(nr_samples) */ diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 385d017840..9620fa9e7a 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -1948,14 +1948,8 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r if (rctx->b.chip_class == EVERGREEN) { evergreen_emit_msaa_state(rctx, rctx->framebuffer.nr_samples, rctx->ps_iter_samples); } else { - unsigned sc_mode_cntl_1 = - EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | - EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1); - - if (rctx->framebuffer.nr_samples > 1) - cayman_emit_msaa_sample_locs(cs, rctx->framebuffer.nr_samples); - cayman_emit_msaa_config(cs, rctx->framebuffer.nr_samples, - rctx->ps_iter_samples, 0, sc_mode_cntl_1); + cayman_emit_msaa_state(cs, rctx->framebuffer.nr_samples, + rctx->ps_iter_samples, 0); } } diff --git a/src/gallium/drivers/r600/r600_pipe_common.h b/src/gallium/drivers/r600/r600_pipe_common.h index 86a20f8639..ee8eb54920 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.h +++ b/src/gallium/drivers/r600/r600_pipe_common.h @@ -799,10 +799,8 @@ extern const unsigned eg_max_dist_4x; void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count, unsigned sample_index, float *out_value); void cayman_init_msaa(struct pipe_context *ctx); -void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples); -void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples, -int ps_iter_samples, int overrast_samples, -unsigned sc_mode_cntl_1); +void cayman_emit_msaa_state(struct radeon_winsys_cs *cs, int nr_samples, + int ps_iter_samples, int 
overrast_samples); /* Inline helpers. */ -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/4] r600: clean up fragment shader input scan code
From: Roland Scheidegger For some reason, we were iterating through the code twice (first just for instructions needing barycentrics, then for instructions and input dcls). Move things around slightly so this is no longer necessary. There also was an unneeded enabling of the fixed_pt_position_gpr - this is only needed if the per-sample interpolation comes from an input, not from an instruction (just move the assert where it belongs) (since the sample id to sample from comes from a tgsi src in this case, and isn't sampleID). Otherwise there should be no functional change. --- src/gallium/drivers/r600/r600_shader.c | 75 +++--- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 13aa681049..1009411c62 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -,7 +,6 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { location = TGSI_INTERPOLATE_LOC_CENTER; - inputs[1].enabled = true; /* needs SAMPLEID */ } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { location = TGSI_INTERPOLATE_LOC_CENTER; /* Needs sample positions, currently those are always available */ @@ -1139,6 +1138,19 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off tgsi_parse_free(&parse); + if (ctx->bc->chip_class >= EVERGREEN) { + int num_baryc = 0; + /* assign gpr to each interpolator according to priority */ + for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) { + if (ctx->eg_interpolators[i].enabled) { + ctx->eg_interpolators[i].ij_index = num_baryc; + num_baryc++; + } + } + num_baryc = (num_baryc + 1) >> 1; + gpr_offset += num_baryc; + } + for (i = 0; i < ARRAY_SIZE(inputs); i++) { boolean enabled = inputs[i].enabled; int *reg = inputs[i].reg; @@ -1165,18 +1177,21 @@ static int allocate_system_value_inputs(struct 
r600_shader_ctx *ctx, int gpr_off * for evergreen we need to scan the shader to find the number of GPRs we need to * reserve for interpolation and system values * - * we need to know if we are going to emit - * any sample or centroid inputs + * we need to know if we are going to emit any sample or centroid inputs * if perspective and linear are required */ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) { unsigned i; - int num_baryc; - struct tgsi_parse_context parse; memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators)); + /* +* Could get this information from the shader info. But right now +* we interpolate all declared inputs, whereas the shader info will +* only contain the bits if the inputs are actually used, so it might +* not be safe... +*/ for (i = 0; i < ctx->info.num_inputs; i++) { int k; /* skip position/face/mask/sampleid */ @@ -1193,53 +1208,9 @@ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) ctx->eg_interpolators[k].enabled = TRUE; } - if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) { - return 0; - } - - /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */ - while (!tgsi_parse_end_of_tokens(&parse)) { - tgsi_parse_token(&parse); - - if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) { - const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET || - inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID) - { - int interpolate, location, k; - - if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) { - location = TGSI_INTERPOLATE_LOC_CENTER; - } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) { - location = TGSI_INTERPOLATE_LOC_CENTER; - } else { - location = TGSI_INTERPOLATE_LOC_CENTROID; -
[Mesa-dev] [PATCH 2/4] mesa: (trivial) remove unused ignore_sample_qualifier parameter
From: Roland Scheidegger This parameter for _mesa_get_min_invocations_per_fragment() was once used by the intel driver, but it's long gone. --- src/mesa/program/program.c| 11 --- src/mesa/program/program.h| 3 +-- src/mesa/state_tracker/st_atom_msaa.c | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index 220efc3539..6aba3cb3f1 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -515,8 +515,7 @@ _mesa_find_free_register(const GLboolean used[], */ GLint _mesa_get_min_invocations_per_fragment(struct gl_context *ctx, - const struct gl_program *prog, - bool ignore_sample_qualifier) + const struct gl_program *prog) { /* From ARB_sample_shading specification: * "Using gl_SampleID in a fragment shader causes the entire shader @@ -534,11 +533,9 @@ _mesa_get_min_invocations_per_fragment(struct gl_context *ctx, * "Use of the "sample" qualifier on a fragment shader input * forces per-sample shading" */ - if (prog->info.fs.uses_sample_qualifier && !ignore_sample_qualifier) - return MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1); - - if (prog->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS)) + if (prog->info.fs.uses_sample_qualifier || + (prog->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID | +SYSTEM_BIT_SAMPLE_POS))) return MAX2(_mesa_geometric_samples(ctx->DrawBuffer), 1); else if (ctx->Multisample.SampleShading) return MAX2(ceil(ctx->Multisample.MinSampleShadingValue * diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index 376da7b2d4..659385f55b 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -108,8 +108,7 @@ _mesa_find_free_register(const GLboolean used[], extern GLint _mesa_get_min_invocations_per_fragment(struct gl_context *ctx, - const struct gl_program *prog, - bool ignore_sample_qualifier); + const struct gl_program *prog); static inline GLuint _mesa_program_enum_to_shader_stage(GLenum v) diff 
--git a/src/mesa/state_tracker/st_atom_msaa.c b/src/mesa/state_tracker/st_atom_msaa.c index 589e328ac5..556c7c5889 100644 --- a/src/mesa/state_tracker/st_atom_msaa.c +++ b/src/mesa/state_tracker/st_atom_msaa.c @@ -77,5 +77,5 @@ st_update_sample_shading(struct st_context *st) return; cso_set_min_samples(st->cso_context, - _mesa_get_min_invocations_per_fragment(st->ctx, &st->fp->Base, false)); + _mesa_get_min_invocations_per_fragment(st->ctx, &st->fp->Base)); } -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600: don't do stack workarounds for hemlock
From: Roland Scheidegger By the looks of it it seems hemlock is treated separately to cypress, but certainly it won't need the stack workarounds cedar/redwood (and seemingly every other eg chip except cypress/juniper) need. (Discovered by accident.) --- src/gallium/drivers/r600/sb/sb_bc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index b35671bf0f..a249395474 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -665,6 +665,7 @@ public: return false; switch (hw_chip) { + case HW_CHIP_HEMLOCK: case HW_CHIP_CYPRESS: case HW_CHIP_JUNIPER: return false; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/3] mesa: skip validation of legality of size/type queries for format queries
From: Roland Scheidegger The size/type query is always legal (if we made it that far). This causes a difference for GL_TEXTURE_BUFFER - the reason is that these parameters are valid only with GetTexLevelParameter() if gl 3.1 is supported, but not if only ARB_texture_buffer_object is supported. However, while the spec says that these queries return "the same information as querying GetTexLevelParameter" I believe we're not expected to return just zeros here. By definition, these pnames are always valid (unlike for the GetTexLevelParameter() function which would return an error without GL 3.1), so returning 0 but no error makes no sense to me. This breaks some piglit arb_internalformat_query2 tests (which I believe to be wrong). --- src/mesa/main/formatquery.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c index 2214f97e67..f345140518 100644 --- a/src/mesa/main/formatquery.c +++ b/src/mesa/main/formatquery.c @@ -960,9 +960,6 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, mesa_format texformat; if (target != GL_RENDERBUFFER) { - if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, true)) -goto end; - baseformat = _mesa_base_tex_format(ctx, internalformat); } else { baseformat = _mesa_base_fbo_format(ctx, internalformat); -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/3] mesa: restrict formats being supported by target type for formatquery
From: Roland Scheidegger The code just considered all formats as being supported if they were either a valid fbo or texture format. This was quite awkward since then the query would return "supported" for e.g. GL_RGB9E5 or compressed formats and target RENDERBUFFER (albeit the driver could still refuse it in theory). However, when then querying for instance the internalformat sizes, it would just return 0 (due to the checks being more strict there). It was also a problem for texture buffer targets, which have a more restricted list of formats which are allowed (and again, it would return supported but then querying sizes would return 0). So only take validation of formats into account which make sense for a given target. Can also toss out some special checks for rgb9e5 later, since we'd never get there if it wasn't supported in the first place. --- src/mesa/main/formatquery.c | 31 +-- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c index e0062a64d2..2214f97e67 100644 --- a/src/mesa/main/formatquery.c +++ b/src/mesa/main/formatquery.c @@ -558,15 +558,29 @@ _is_internalformat_supported(struct gl_context *ctx, GLenum target, * implementation accepts it for any texture specification commands, and * - unsized or base internal format, if the implementation accepts * it for texture or image specification. +* +* But also: +* "If the particualar and combination do not make +* sense, or if a particular type of is not supported by the +* implementation the "unsupported" answer should be given. This is not an +* error. */ GLint buffer[1]; - /* At this point an internalformat is valid if it is valid as a texture or -* as a renderbuffer format. 
The checks are different because those methods -* return different values when passing non supported internalformats */ - if (_mesa_base_tex_format(ctx, internalformat) < 0 && - _mesa_base_fbo_format(ctx, internalformat) == 0) - return false; + if (target == GL_RENDERBUFFER) { + if (_mesa_base_fbo_format(ctx, internalformat) == 0) { + return false; + } + } else if (target == GL_TEXTURE_BUFFER) { + if (_mesa_validate_texbuffer_format(ctx, internalformat) == + MESA_FORMAT_NONE) { + return false; + } + } else { + if (_mesa_base_tex_format(ctx, internalformat) < 0) { + return false; + } + } /* Let the driver have the final word */ ctx->Driver.QueryInternalFormat(ctx, target, internalformat, @@ -969,10 +983,7 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, * and glGetRenderbufferParameteriv functions. */ if (pname == GL_INTERNALFORMAT_SHARED_SIZE) { - if (_mesa_has_EXT_texture_shared_exponent(ctx) && - target != GL_TEXTURE_BUFFER && - target != GL_RENDERBUFFER && - texformat == MESA_FORMAT_R9G9B9E5_FLOAT) { + if (texformat == MESA_FORMAT_R9G9B9E5_FLOAT) { buffer[0] = 5; } goto end; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/3] mesa: remove misleading gles checks for formatquery
From: Roland Scheidegger Testing for gles there is just confusing - this is about target being supported, if it was valid at all was already determined earlier (in _legal_parameters). It didn't make sense at all in any case, since it would only have said false there for gles for 2d but not 2d arrays etc. --- src/mesa/main/formatquery.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c index 61f798c88f..e0062a64d2 100644 --- a/src/mesa/main/formatquery.c +++ b/src/mesa/main/formatquery.c @@ -392,14 +392,12 @@ _is_target_supported(struct gl_context *ctx, GLenum target) * implementation the "unsupported" answer should be given. * This is not an error." * -* For OpenGL ES, queries can only be used with GL_RENDERBUFFER or MS. +* Note that legality of targets has already been verified. */ switch(target){ case GL_TEXTURE_1D: case GL_TEXTURE_2D: case GL_TEXTURE_3D: - if (!_mesa_is_desktop_gl(ctx)) - return false; break; case GL_TEXTURE_1D_ARRAY: @@ -702,6 +700,12 @@ _mesa_query_internal_format_default(struct gl_context *ctx, GLenum target, case GL_FRAMEBUFFER_RENDERABLE_LAYERED: case GL_FRAMEBUFFER_BLEND: case GL_FILTER: + /* + * XXX seems a tad optimistic just saying yes to everything here. + * Even for combinations which make no sense... + * And things like TESS_CONTROL_TEXTURE should definitely default to + * NONE if the driver doesn't even support tessellation... + */ params[0] = GL_FULL_SUPPORT; break; case GL_NUM_TILING_TYPES_EXT: -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix crash with seamless cube filtering with different min/mag filter
From: Roland Scheidegger We are not allowed to modify the incoming coords values, or things may crash (as we may be inside a llvm conditional and the values may be used in another branch). I recently broke this when fixing an issue with NaNs and seamless cube map filtering, and it causes crashes when doing cubemap filtering if the min and mag filters are different. Add const to the pointers passed in to prevent this mishap in the future. Fixes: a485ad0bcd ("gallivm: fix an issue with NaNs with seamless cube filtering") --- src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 38 +-- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index ff8cbf6..8f760f5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -857,7 +857,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, LLVMValueRef mipoffsets, - LLVMValueRef *coords, + const LLVMValueRef *coords, const LLVMValueRef *offsets, LLVMValueRef colors_out[4]) { @@ -1004,7 +1004,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, LLVMValueRef mipoffsets, - LLVMValueRef *coords, + const LLVMValueRef *coords, const LLVMValueRef *offsets, LLVMValueRef colors_out[4]) { @@ -1106,7 +1106,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, struct lp_build_if_state edge_if; LLVMTypeRef int1t; LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2]; - LLVMValueRef coord, have_edge, have_corner; + LLVMValueRef coord0, coord1, have_edge, have_corner; LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y; LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp; LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped; @@ -1130,20 +1130,20 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld, * other values might be bogus in the end too). * So kill off the NaNs here. */ - coords[0] = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero, - GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); - coords[1] = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero, - GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); - coord = lp_build_mul(coord_bld, coords[0], flt_width_vec); + coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero, +GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); + coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec); /* instead of clamp, build mask if overflowed */ - coord = lp_build_sub(coord_bld, coord, half); + coord0 = lp_build_sub(coord_bld, coord0, half); /* convert to int, compute lerp weight */ /* not ideal with AVX (and no AVX2) */ - lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart); + lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart); x1 = lp_build_add(ivec_bld, x0, ivec_bld->one); - coord = lp_build_mul(coord_bld, coords[1], flt_height_vec); - coord = lp_build_sub(coord_bld, coord, half); - lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart); + coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero, +GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); + coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec); + coord1 = lp_build_sub(coord_bld, coord1, half); + lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart); y1 = lp_build_add(ivec_bld, y0, ivec_bld->one); fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero); @@ -1747,7 +1747,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, unsigned img_filter, unsigned mip_filter, boolean is_gather, - LLVMValueRef *coords, + const LLVMValueRef *coords, const LLVMValueRef *offsets, LLVMValueRef ilevel0, LLVMValueRef ilevel1, @@ -1820,6 +1820,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, PIPE_FUNC_GREATER, lod_fpart, bld->lodf_bld.zero); need_lerp =
[Mesa-dev] [PATCH] r600: increase number of samplers/views from 16 to 18 on eg
From: Roland Scheidegger Some apps are known to require more than 16. Albeit they probably still won't run with 18 (since all new hw/drivers support 32) it shouldn't hurt to at least support 18 (seemingly the hw limit on all r600-ni chips - the blob also supports 18, at least for eg+ by the looks of it). Unfortunately border colors do not work for the last 2 units. The reg guide says there is a 5 bit index for setting border colors, but this is a lie. piglit max-samplers shows that indeed setting border color for units 16/17 (per stage) will simply overwrite the border color for units 0/1, and sampling will consequently also use those border color values for sampling on units 16/17. (For eg - no idea about ni.) This will cause piglit max-samplers border to fail, but meh... border colors are more or less totally busted (sampler swizzling...) on that hw anyway. Border colors should still work if not both units 0 and 16 (or units 1 and 17) use a border color simultaneously. Setting border color values on r600/r700 is different, and I have no idea if the hw would also wrap-around when trying to use border colors or do something crazy (like locking up...) so don't increase the limit there (since the blob doesn't do it I'm not sure if it would be safe). 
--- src/gallium/drivers/r600/evergreen_state.c | 7 +++ src/gallium/drivers/r600/r600_pipe.c | 6 +- src/gallium/drivers/r600/r600_pipe.h | 8 src/gallium/drivers/r600/r600_state_common.c | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index fb1de9cbf4..55a460053c 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2372,6 +2372,13 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, radeon_emit(cs, (resource_id_base + i) * 3); radeon_emit_array(cs, rstate->tex_sampler_words, 3); + /* +* Note for sampler 16/17 this will overwrite border color +* on sampler 0/1. As long as border color isn't used on +* both units 0 and 16 (or 1 and 17) it should actually work +* since the sampler also appears to remap those border color +* values the same way. +*/ if (rstate->border_color_use) { radeon_set_config_reg_seq(cs, border_index_reg, 5); radeon_emit(cs, i); diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 95aa2e5383..7f9500ad4b 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -595,7 +595,11 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, return 1; case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return 16; + /* +* There is potentially even more trouble with border colors +* for units 16/17 on r600/r700, so only enable 18 on eg+ +*/ + return rscreen->b.family >= CHIP_CEDAR ? 
R600_NUM_TEX_UNITS : 16; case PIPE_SHADER_CAP_PREFERRED_IR: if (shader == PIPE_SHADER_COMPUTE) { return PIPE_SHADER_IR_NATIVE; diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 112b5cbb83..e2bd7b0a99 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -366,7 +366,7 @@ struct r600_pipe_sampler_state { }; /* needed for blitter save */ -#define NUM_TEX_UNITS 16 +#define R600_NUM_TEX_UNITS 18 struct r600_seamless_cube_map { struct r600_atomatom; @@ -375,7 +375,7 @@ struct r600_seamless_cube_map { struct r600_samplerview_state { struct r600_atomatom; - struct r600_pipe_sampler_view *views[NUM_TEX_UNITS]; + struct r600_pipe_sampler_view *views[R600_NUM_TEX_UNITS]; uint32_tenabled_mask; uint32_tdirty_mask; uint32_tcompressed_depthtex_mask; /* which textures are depth */ @@ -385,7 +385,7 @@ struct r600_samplerview_state { struct r600_sampler_states { struct r600_atomatom; - struct r600_pipe_sampler_state *states[NUM_TEX_UNITS]; + struct r600_pipe_sampler_state *states[R600_NUM_TEX_UNITS]; uint32_tenabled_mask; uint32_tdirty_mask; uint32_thas_bordercolor_mask; /* which states contain the border color */ @@ -394,7 +394,7 @@ struct r600_sampler_states { struct r600_textures_info { struct r600_samplerview_state views; struct r600_sampler_states states; -
[Mesa-dev] [PATCH] draw: remove VSPLIT_CREATE_IDX macro
From: Roland Scheidegger Just inline the little bit of code. --- src/gallium/auxiliary/draw/draw_pt_vsplit.c | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c index 3ff077b..653deab 100644 --- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c @@ -116,21 +116,15 @@ vsplit_get_base_idx(unsigned start, unsigned fetch) return draw_overflow_uadd(start, fetch, MAX_ELT_IDX); } -/* - * The final element index is just element index plus element bias. - */ -#define VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias)\ - unsigned elt_idx; \ - elt_idx = vsplit_get_base_idx(start, fetch);\ - elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + (int)elt_bias); - static inline void vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts, unsigned start, unsigned fetch, int elt_bias) { struct draw_context *draw = vsplit->draw; - VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias); + unsigned elt_idx; + elt_idx = vsplit_get_base_idx(start, fetch); + elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + elt_bias); /* unlike the uint case this can only happen with elt_bias */ if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) { unsigned hash = elt_idx % MAP_SIZE; @@ -145,7 +139,9 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts, unsigned start, unsigned fetch, int elt_bias) { struct draw_context *draw = vsplit->draw; - VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias); + unsigned elt_idx; + elt_idx = vsplit_get_base_idx(start, fetch); + elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + elt_bias); /* unlike the uint case this can only happen with elt_bias */ if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) { unsigned hash = elt_idx % MAP_SIZE; @@ -165,7 +161,12 @@ vsplit_add_cache_uint(struct vsplit_frontend *vsplit, 
const uint *elts, unsigned start, unsigned fetch, int elt_bias) { struct draw_context *draw = vsplit->draw; - VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias); + unsigned elt_idx; + /* +* The final element index is just element index plus element bias. +*/ + elt_idx = vsplit_get_base_idx(start, fetch); + elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + elt_bias); /* Take care for DRAW_MAX_FETCH_IDX (since cache is initialized to -1). */ if (elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) { unsigned hash = elt_idx % MAP_SIZE; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] draw: fix vsplit code when the (post-bias) index value is -1
From: Roland Scheidegger vsplit_add_cache uses the post-bias index for hashing, but the vsplit_add_cache_uint/ushort/ubyte ones used the pre-bias index, therefore the code for handling the special case (because -1 matches the initialization value of the cache) wasn't actually working. Commit 78a997f72841310620d18daa9015633343d04db1 actually simplified the cache logic somewhat, but it looks like this particular problem carried over (and duplicated to the ushort/ubyte cases, since before only uint needed it). This could lead to the vsplit cache doing the wrong thing, in particular later fetch_info might indicate there are 0 values to fetch. This only really affected edge cases which were bogus to begin with, but it could lead to a crash with the jit vertex shader, since it cannot handle this case correctly (the count loop is always executed at least once and we would not allocate any memory for the shader outputs), so add another assert to catch it there. --- src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c | 1 + src/gallium/auxiliary/draw/draw_pt_vsplit.c| 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index c6492a1..5e0c562 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -368,6 +368,7 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle, unsigned start_or_maxelt, vid_base; const unsigned *elts; + assert(fetch_info->count > 0); llvm_vert_info.count = fetch_info->count; llvm_vert_info.vertex_size = fpme->vertex_size; llvm_vert_info.stride = fpme->vertex_size; diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c index a68d5bf..3ff077b 100644 --- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c @@ -133,7 
+133,7 @@ vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts, VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias); /* unlike the uint case this can only happen with elt_bias */ if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) { - unsigned hash = fetch % MAP_SIZE; + unsigned hash = elt_idx % MAP_SIZE; vsplit->cache.fetches[hash] = 0; vsplit->cache.has_max_fetch = TRUE; } @@ -148,7 +148,7 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, const ushort *elts, VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias); /* unlike the uint case this can only happen with elt_bias */ if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) { - unsigned hash = fetch % MAP_SIZE; + unsigned hash = elt_idx % MAP_SIZE; vsplit->cache.fetches[hash] = 0; vsplit->cache.has_max_fetch = TRUE; } @@ -168,7 +168,7 @@ vsplit_add_cache_uint(struct vsplit_frontend *vsplit, const uint *elts, VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias); /* Take care for DRAW_MAX_FETCH_IDX (since cache is initialized to -1). */ if (elt_idx == DRAW_MAX_FETCH_IDX && !vsplit->cache.has_max_fetch) { - unsigned hash = fetch % MAP_SIZE; + unsigned hash = elt_idx % MAP_SIZE; /* force update - any value will do except DRAW_MAX_FETCH_IDX */ vsplit->cache.fetches[hash] = 0; vsplit->cache.has_max_fetch = TRUE; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600: fix relocs for PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE query
From: Roland Scheidegger The command parser is very sad if we don't emit the relocs per hw query... However, don't enable it. It mostly works, but piglit arb_transform_feedback_overflow_query-basic shows 2 failures (it's really the same case for the hw), conditional_render_any and conditional_render_single. By some experimentation, it looks like the firmware combines the values wrongly for the non-inverted (i.e. hw-inverted) case - it will only not draw if all 4 streams overflow, rather than just at least one. Interestingly, radeonsi has a workaround for some VI firmware which looks like it was the exact same firmware bug. Hence, looks like it would need new firmware to properly fix this. (Tested on Juniper, not sure if firmware for all chips is broken.) --- src/gallium/drivers/r600/r600_query.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index b4519830cc..5ff0570308 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -742,9 +742,12 @@ static void r600_query_hw_do_emit_start(struct r600_common_context *ctx, emit_sample_streamout(cs, va, query->stream); break; case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: - for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) + for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) { emit_sample_streamout(cs, va + 32 * stream, stream); - break; + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, + RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + } + return; case PIPE_QUERY_TIME_ELAPSED: /* Write the timestamp after the last draw is done. 
* (bottom-of-pipe) @@ -827,9 +830,12 @@ static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, break; case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE: va += 16; - for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) + for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) { emit_sample_streamout(cs, va + 32 * stream, stream); - break; + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, + RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); + } + return; case PIPE_QUERY_TIME_ELAPSED: va += 8; /* fall through */ -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] mesa: require at least 14 UBOs for GL 4.3
From: Roland Scheidegger ARB_ubo requires 12 UBOs (per stage) at least, but this limit has been raised by GL 4.3 to 14, so don't advertise GL 4.3 without it (only checking the vertex stage since all drivers probably have the same limit anyway for other stages). (piglit has minmax tests for that kind of thing, but they go only up to 3.3, so this won't really be noticed.) I think this currently should not affect any driver - r600 until very recently only supported 12 but now advertises 14 too. --- src/mesa/main/version.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index 90c5c5f..68079f4 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -352,6 +352,7 @@ compute_version(const struct gl_extensions *extensions, extensions->ARB_transform_feedback_instanced); const bool ver_4_3 = (ver_4_2 && consts->GLSLVersion >= 430 && + consts->Program[MESA_SHADER_VERTEX].MaxUniformBlocks >= 14 && extensions->ARB_ES3_compatibility && extensions->ARB_arrays_of_arrays && extensions->ARB_compute_shader && -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] util: fix NORETURN for msvc, add HAVE_FUNC_ATTRIBUTE_NORETURN to c99_compat.h
From: Roland Scheidegger We've seen some problems internally due to macro redefinition. Fix this by adding HAVE_FUNC_ATTRIBUTE_NORETURN to c99_compat.h, and defining it for msvc. And avoid redefinition just in case. --- include/c99_compat.h | 1 + src/util/macros.h| 12 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/c99_compat.h b/include/c99_compat.h index cb690c6..81621a7 100644 --- a/include/c99_compat.h +++ b/include/c99_compat.h @@ -164,6 +164,7 @@ test_c99_compat_h(const void * restrict a, #define HAVE_FUNC_ATTRIBUTE_FORMAT 1 #define HAVE_FUNC_ATTRIBUTE_PACKED 1 #define HAVE_FUNC_ATTRIBUTE_ALIAS 1 +#define HAVE_FUNC_ATTRIBUTE_NORETURN 1 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) /* https://gcc.gnu.org/onlinedocs/gcc-4.3.6/gcc/Other-Builtins.html */ diff --git a/src/util/macros.h b/src/util/macros.h index 2a08407..5ce0e57 100644 --- a/src/util/macros.h +++ b/src/util/macros.h @@ -171,10 +171,14 @@ do { \ #define ATTRIBUTE_RETURNS_NONNULL #endif -#ifdef HAVE_FUNC_ATTRIBUTE_NORETURN -#define NORETURN __attribute__((__noreturn__)) -#else -#define NORETURN +#ifndef NORETURN +# ifdef _MSC_VER +#define NORETURN __declspec(noreturn) +# elif defined HAVE_FUNC_ATTRIBUTE_NORETURN +#define NORETURN __attribute__((__noreturn__)) +# else +#define NORETURN +# endif #endif #ifdef __cplusplus -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/3] r600: fix enabled_rb_mask on eg/cm
From: Roland Scheidegger For eg/cm, the r600_gb_backend_map will always be 0. I assume this is a bug in the drm kernel driver, as it just never fills the information in. I am not entirely sure if the map is supposed to be needed for these chips, since unlike on r600/r700 the value calculated for the map is in fact written to the GB_BACKEND_MAP reg, for which I am unable to dig up any documentation. In any case, this causes r600_query_hw_prepare_buffer to write the "status bit" (just the highest bit of the occlusion query result) even for active rbes (all but the first). This doesn't make much sense, albeit I suppose it's mostly safe. According to the commit history, it's necessary to set these bits for inactive rbes since otherwise predication will lock up - presumably the hw is just waiting for the status bit to appear, which will never happen with inactive rbes. I'd guess potentially predication could be wrong (due to not waiting for the actual result if the status bit is already there) if this is set for active rbes. Discovered while trying to fix predication lockups on Juniper (needs another patch). --- src/gallium/drivers/r600/r600_query.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index 987da9a806..699404b10d 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -1834,8 +1834,14 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen) assert(rscreen->chip_class <= CAYMAN); - /* if backend_map query is supported by the kernel */ - if (rscreen->info.r600_gb_backend_map_valid) { + /* +* if backend_map query is supported by the kernel. +* Note the kernel drm driver (as of now) never fills in the associated +* data on eg/cm, only r600/r700, hence ignore the valid bit there. +* (Albeit some chips with just one active rb can have a valid 0 map.) 
+*/ + if (rscreen->info.r600_gb_backend_map_valid && + (ctx->chip_class < EVERGREEN || rscreen->info.r600_gb_backend_map != 0)) { unsigned num_tile_pipes = rscreen->info.num_tile_pipes; unsigned backend_map = rscreen->info.r600_gb_backend_map; unsigned item_width, item_mask; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/3] r600: hack up num_render_backends on Juniper to 8
From: Roland Scheidegger Juniper really has a maximum of 4 RBEs (16 pixels). However, predication always locks up on my HD 5750, and through experiments it looks like if we're pretending it has a maximum of 8, with 4 disabled, it works correctly. My conclusion would be that there's a bug (likely firmware, not hw) which causes the predication logic to try to read 8 results out of the query buffer instead of just 4, and since of course no one ever writes the upper 4, the status bit is never set and hence it will wait for it forever. Ideally this would be fixed in firmware, but I'd guess chances of that happening are slim. This will double the size of (occlusion) query result buffers, write the status bit for the disabled rbs in these buffers, and will also add 8 results together instead of just 4 when reading them back. The latter is unnecessary, but it's probably not worth bothering - luckily num_render_backends isn't used outside of occlusion queries, so we don't need a separate value for the "real" maximum. Also print out the enabled_rb_mask if it changed from the pre-fixed value (which is already printed out), just in case there are some more problems with chips which have some rbs disabled... This fixes all the lockups with piglit nv_conditional_render tests on my HD 5750 (all pass). 
--- src/gallium/drivers/r600/r600_query.c | 21 +++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index 699404b10d..6fc00819b1 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -1830,7 +1830,19 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen) struct r600_resource *buffer; uint32_t *results; unsigned i, mask = 0; - unsigned max_rbs = ctx->screen->info.num_render_backends; + unsigned max_rbs; + + if (ctx->family == CHIP_JUNIPER) { + /* +* Fix for predication lockups - the chip can only ever have +* 4 RBs, however it looks like the predication logic assumes +* there's 8, trying to read results from query buffers never +* written to. By increasing this number we'll write the +* status bit for these as per the normal disabled rb logic. +*/ + ctx->screen->info.num_render_backends = 8; + } + max_rbs = ctx->screen->info.num_render_backends; assert(rscreen->chip_class <= CAYMAN); @@ -1901,8 +1913,13 @@ void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen) r600_resource_reference(&buffer, NULL); - if (mask) + if (mask) { + if (rscreen->debug_flags & DBG_INFO && + mask != rscreen->info.enabled_rb_mask) { + printf("enabled_rb_mask (fixed) = 0x%x\n", mask); + } rscreen->info.enabled_rb_mask = mask; + } } #define XFULL(name_, query_type_, type_, result_type_, group_id_) \ -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/3] winsys/radeon: fix up default enabled_rb_mask for r600
From: Roland Scheidegger The logic had two fatal flaws which completely killed the default value. 1) drm will overwrite the value anyway even if the chip can't be handled 2) the default value logic is relying on num_render_backends, which was filled in later. Luckily no one is relying on it, but it's a bit confusing seeing the chip clock printed out there (as hex) with R600_DEBUG=info... (Albeit radeonsi does not appear to fix up the value. If kernels which don't handle this query are still supported, radeonsi will still end up with a broken enabled_rb_mask, I have no idea of the potential results of this there.) --- src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index e600199d26..10f2ecc900 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -369,12 +369,6 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws) &ws->info.max_shader_clock); ws->info.max_shader_clock /= 1000; -/* Default value. */ -ws->info.enabled_rb_mask = u_bit_consecutive(0, ws->info.num_render_backends); -/* This fails on non-GCN or older kernels: */ -radeon_get_drm_value(ws->fd, RADEON_INFO_SI_BACKEND_ENABLED_MASK, NULL, - &ws->info.enabled_rb_mask); - ws->num_cpus = sysconf(_SC_NPROCESSORS_ONLN); /* Generation-specific queries. */ @@ -433,6 +427,16 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws) &ws->info.r600_gb_backend_map)) ws->info.r600_gb_backend_map_valid = true; +/* Default value. */ +ws->info.enabled_rb_mask = u_bit_consecutive(0, ws->info.num_render_backends); +/* + * This fails (silently) on non-GCN or older kernels, overwriting the + * default enabled_rb_mask with the result of the last query. 
+*/ +if (ws->gen >= DRV_SI) +radeon_get_drm_value(ws->fd, RADEON_INFO_SI_BACKEND_ENABLED_MASK, NULL, + &ws->info.enabled_rb_mask); + ws->info.has_virtual_memory = false; if (ws->info.drm_minor >= 13) { uint32_t ib_vm_max_size; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] r600: RFC: use GET_BUFFER_RESINFO vtx fetch on eg instead of setting up consts
From: Roland Scheidegger Contrary to what the comment said, this appears to work just fine on my rv770 (tested with piglit textureSize 140 fs/vs samplerBuffer). I have no clue though if it's actually preferable to use it (unfortunately we cannot get rid of the tex constants completely, as we still require them for cube map txq). Albeit filling in the format (1 channels or 4?) and the stuff related to mega- or mini-fetch (what the hell is this...) is just a guess based on other usage of vtx fetch instructions... v2: it really needs to be done through texture cache (I botched the testing because sb optimizations turned it automatically into tc, but can't rely on it and isn't happening on tes). Tested-by: Konstantin Kharlamov --- src/gallium/drivers/r600/evergreen_state.c | 7 ++-- src/gallium/drivers/r600/r600_asm.c | 3 +- src/gallium/drivers/r600/r600_shader.c | 59 ++-- src/gallium/drivers/r600/r600_state_common.c | 39 +++--- 4 files changed, 50 insertions(+), 58 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index f5b8e7115d..f645791a2c 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -653,11 +653,12 @@ static void evergreen_fill_buffer_resource_words(struct r600_context *rctx, S_030008_ENDIAN_SWAP(endian); tex_resource_words[3] = swizzle_res | S_03000C_UNCACHED(params->uncached); /* -* in theory dword 4 is for number of elements, for use with resinfo, -* but it seems to utterly fail to work, the amd gpu shader analyser +* dword 4 is for number of elements, for use with resinfo, +* albeit the amd gpu shader analyser * uses a const buffer to store the element sizes for buffer txq */ - tex_resource_words[4] = 0; + tex_resource_words[4] = params->size / stride; + tex_resource_words[5] = tex_resource_words[6] = 0; tex_resource_words[7] = S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER); } diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c index d6bd561f01..92c2bdf27c 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -1510,7 +1510,8 @@ int cm_bytecode_add_cf_end(struct r600_bytecode *bc) /* common to all 3 families */ static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id) { - bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | + bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(vtx->op) | + S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) | S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) | S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 8a36bcf1b4..d349c9d7f1 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -6949,31 +6949,48 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - struct r600_bytecode_alu alu; int r; int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset; + int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; - if (ctx->bc->chip_class >= EVERGREEN) { - /* with eg each dword is either buf size or number of cubes */ - alu.src[0].sel += id / 4; - alu.src[0].chan = id % 4; - } else { + if (ctx->bc->chip_class < EVERGREEN) { + struct r600_bytecode_alu alu; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; /* r600 we have them at channel 2 of the second dword */ alu.src[0].sel += (id * 2) + 1; alu.src[0].chan = 1; + alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; + } else { + struct r600_bytecode_vtx vtx; + memset(&vtx, 0, sizeof(vtx)); + vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */ + vtx.buffer_id = id + R600_MAX_CONST_BU
[Mesa-dev] [PATCH 4/6] r600: RFC: use GET_BUFFER_RESINFO vtx fetch on eg instead of setting up consts
From: Roland Scheidegger Contrary to what the comment said, this appears to work just fine on my rv770 (tested with piglit textureSize 140 fs/vs samplerBuffer). I have no clue though if it's actually preferable to use it (unfortunately we cannot get rid of the tex constants completely, as we still require them for cube map txq). Albeit filling in the format (1 channels or 4?) and the stuff related to mega- or mini-fetch (what the hell is this...) is just a guess based on other usage of vtx fetch instructions... The docs (for eg, not cayman) suggests this has to be done through tc cache but it seems to work either way (since it actually just fetches the value from the buffer descriptor I'm not sure why caches would be involved). --- src/gallium/drivers/r600/evergreen_state.c | 7 ++-- src/gallium/drivers/r600/r600_asm.c | 3 +- src/gallium/drivers/r600/r600_shader.c | 59 ++-- src/gallium/drivers/r600/r600_state_common.c | 39 +++--- 4 files changed, 50 insertions(+), 58 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index f5b8e7115d..f645791a2c 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -653,11 +653,12 @@ static void evergreen_fill_buffer_resource_words(struct r600_context *rctx, S_030008_ENDIAN_SWAP(endian); tex_resource_words[3] = swizzle_res | S_03000C_UNCACHED(params->uncached); /* -* in theory dword 4 is for number of elements, for use with resinfo, -* but it seems to utterly fail to work, the amd gpu shader analyser +* dword 4 is for number of elements, for use with resinfo, +* albeit the amd gpu shader analyser * uses a const buffer to store the element sizes for buffer txq */ - tex_resource_words[4] = 0; + tex_resource_words[4] = params->size / stride; + tex_resource_words[5] = tex_resource_words[6] = 0; tex_resource_words[7] = S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER); } diff --git a/src/gallium/drivers/r600/r600_asm.c 
b/src/gallium/drivers/r600/r600_asm.c index d6bd561f01..92c2bdf27c 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -1510,7 +1510,8 @@ int cm_bytecode_add_cf_end(struct r600_bytecode *bc) /* common to all 3 families */ static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id) { - bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | + bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(vtx->op) | + S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) | S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) | S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 8a36bcf1b4..51c38a6e00 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -6949,31 +6949,48 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - struct r600_bytecode_alu alu; int r; int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset; + int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; - if (ctx->bc->chip_class >= EVERGREEN) { - /* with eg each dword is either buf size or number of cubes */ - alu.src[0].sel += id / 4; - alu.src[0].chan = id % 4; - } else { + if (ctx->bc->chip_class < EVERGREEN) { + struct r600_bytecode_alu alu; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; /* r600 we have them at channel 2 of the second dword */ alu.src[0].sel += (id * 2) + 1; alu.src[0].chan = 1; + alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; + } else { + struct r600_bytecode_vtx vtx; + memset(&vtx, 0, sizeof(vtx)); + vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */ + vtx.buffer_id = id + R600_MAX_CONST_
[Mesa-dev] [PATCH 2/6] r600: don't use vtx offset for load_sample_position
From: Roland Scheidegger The offset looks bogus to me. Albeit in the end it doesn't matter, by the looks of it offsets smaller than 4 get ignored there (not sure of the rules, I suppose either non-dword aligned offsets never work there or the offset must be at least aligned to the size of a single element). --- src/gallium/drivers/r600/r600_shader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index e28882b2e5..792da950b3 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -1284,7 +1284,7 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ vtx.num_format_all = 2; vtx.format_comp_all = 1; vtx.use_const_fields = 0; - vtx.offset = 1; // first element is size of buffer + vtx.offset = 0; vtx.endian = r600_endian_swap(32); vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/6] r600: fix sampler indexing with texture buffers sampling
From: Roland Scheidegger This fixes the new piglit test. (I could not actually figure out where the hell that index_1 parameter comes from but in any case it's completely the same as for ordinary texturing...) While here also fix up the logic for early exit of setting up driver consts. --- src/gallium/drivers/r600/r600_shader.c | 2 ++ src/gallium/drivers/r600/r600_state_common.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 792da950b3..8a36bcf1b4 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -6856,6 +6856,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; int src_gpr, r, i; int id = tgsi_tex_get_src_gpr(ctx, 1); + int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE src_gpr = tgsi_tex_get_src_gpr(ctx, 0); if (src_requires_loading) { @@ -6887,6 +6888,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */ vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 
3 : 7; /* SEL_W */ vtx.use_const_fields = 1; + vtx.buffer_index_mode = sampler_index_mode; if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx))) return r; diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index e9dd80fa96..4429246d31 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -1380,8 +1380,8 @@ void eg_setup_buffer_constants(struct r600_context *rctx, int shader_type) } if (!samplers->views.dirty_buffer_constants && - (images && !images->dirty_buffer_constants) && - (buffers && !buffers->dirty_buffer_constants)) + !(images && images->dirty_buffer_constants) && + !(buffers && buffers->dirty_buffer_constants)) return; if (images) -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/6] r600: increase number of ubos by one to 14
From: Roland Scheidegger Ideally we'd support 16 (d3d11 requires 15, and mesa subtracts one for non-ubo constants), but that's kind of impossible (it would be only doable if either we'd somehow merge the mesa non-ubo constants with the driver constants, or only use the driver constants with vtx fetch instead of through the kcache mechanism - the latter probably wouldn't be too bad). For now just do as the comment already said, place the gs ring (not really a const buffer in any case) which is only ever referred to through vc fetch clauses at index 16. Throw in a couple asserts for good measure to make sure the hw limit isn't exceeded. --- src/gallium/drivers/r600/evergreen_state.c | 1 + src/gallium/drivers/r600/r600_asm.c| 1 + src/gallium/drivers/r600/r600_pipe.h | 10 ++ src/gallium/drivers/r600/r600_state.c | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 81b7c4a285..f5b8e7115d 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2168,6 +2168,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, va = rbuffer->gpu_address + cb->buffer_offset; if (!gs_ring_buffer) { + assert(buffer_index < R600_MAX_HW_CONST_BUFFERS); radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4, DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags); radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8, diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 69b2d142c1..d6bd561f01 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -1008,6 +1008,7 @@ static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc, continue; bank = alu->src[i].kc_bank; + assert(bank < R600_MAX_HW_CONST_BUFFERS); line = (sel-512)>>4; index_mode = alu->src[i].kc_rel ? 
1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index e042edf2b4..cb84bc1998 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -69,11 +69,12 @@ #define R600_MAX_DRAW_CS_DWORDS58 #define R600_MAX_PFP_SYNC_ME_DWORDS16 -#define R600_MAX_USER_CONST_BUFFERS 13 +#define EG_MAX_ATOMIC_BUFFERS 8 + +#define R600_MAX_USER_CONST_BUFFERS 14 #define R600_MAX_DRIVER_CONST_BUFFERS 3 #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + R600_MAX_DRIVER_CONST_BUFFERS) - -#define EG_MAX_ATOMIC_BUFFERS 8 +#define R600_MAX_HW_CONST_BUFFERS 16 /* start driver buffers after user buffers */ #define R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS) @@ -84,7 +85,8 @@ #define R600_LDS_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS + 1) /* * Note GS doesn't use a constant buffer binding, just a resource index, - * so it's fine to have it exist at index 16. + * so it's fine to have it exist at index 16. I.e. it's not actually + * a const buffer, just a buffer resource. */ #define R600_GS_RING_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS + 2) /* Currently R600_MAX_CONST_BUFFERS just fits on the hw, which has a limit diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 253ff57a98..89cf7d2e50 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -1712,6 +1712,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, offset = cb->buffer_offset; if (!gs_ring_buffer) { + assert(buffer_index < R600_MAX_HW_CONST_BUFFERS); radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4, DIV_ROUND_UP(cb->buffer_size, 256)); radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8); -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/6] r600: increase number of UBOs to 15
From: Roland Scheidegger With the exception of the default tess levels only ever accessed by the default tcs shader, the LDS_INFO const buffer was only accessed by vtx instructions, and not through kcache. No idea why really, but use this to our advantage by not using a constant buffer slot for it. This just requires us to throw the default tess levels into the "normal" driver const buffer instead. Alternatively, could access those constants via vtx instructions too, but then we couldn't use an ordinary ureg prog accessing them as constants and would have to generate that directly when compiling the default tcs shader. (Another alternative would be to put all lds info into the ordinary driver const buffer, albeit we'd maybe need to increase the fixed size as it can't fit alongside the ucp since vs needs access to the lds info too.) --- src/gallium/drivers/r600/evergreen_state.c | 15 -- src/gallium/drivers/r600/r600_pipe.h | 13 src/gallium/drivers/r600/r600_state_common.c | 31 +--- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index f645791a2c..4cc48dfa11 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2168,8 +2168,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, va = rbuffer->gpu_address + cb->buffer_offset; - if (!gs_ring_buffer) { - assert(buffer_index < R600_MAX_HW_CONST_BUFFERS); + if (buffer_index < R600_MAX_HW_CONST_BUFFERS) { radeon_set_context_reg_flag(cs, reg_alu_constbuf_size + buffer_index * 4, DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags); radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8, @@ -3880,7 +3879,7 @@ static void evergreen_set_tess_state(struct pipe_context *ctx, memcpy(rctx->tess_state, default_outer_level, sizeof(float) * 4); memcpy(rctx->tess_state+4, default_inner_level, sizeof(float) * 2); - rctx->tess_state_dirty = 
true; + rctx->driver_consts[PIPE_SHADER_TESS_CTRL].tcs_default_levels_dirty = true; } static void evergreen_setup_immed_buffer(struct r600_context *rctx, @@ -4344,7 +4343,7 @@ void evergreen_setup_tess_constants(struct r600_context *rctx, const struct pipe unsigned input_vertex_size, output_vertex_size; unsigned input_patch_size, pervertex_output_patch_size, output_patch_size; unsigned output_patch0_offset, perpatch_output_offset, lds_size; - uint32_t values[16]; + uint32_t values[8]; unsigned num_waves; unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes; unsigned wave_divisor = (16 * num_pipes); @@ -4364,7 +4363,6 @@ void evergreen_setup_tess_constants(struct r600_context *rctx, const struct pipe if (rctx->lds_alloc != 0 && rctx->last_ls == ls && - !rctx->tess_state_dirty && rctx->last_num_tcs_input_cp == num_tcs_input_cp && rctx->last_tcs == tcs) return; @@ -4411,17 +4409,12 @@ void evergreen_setup_tess_constants(struct r600_context *rctx, const struct pipe rctx->lds_alloc = (lds_size | (num_waves << 14)); - memcpy(&values[8], rctx->tess_state, 6 * sizeof(float)); - values[14] = 0; - values[15] = 0; - - rctx->tess_state_dirty = false; rctx->last_ls = ls; rctx->last_tcs = tcs; rctx->last_num_tcs_input_cp = num_tcs_input_cp; constbuf.user_buffer = values; - constbuf.buffer_size = 16 * 4; + constbuf.buffer_size = 8 * 4; rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_VERTEX, R600_LDS_INFO_CONST_BUFFER, &constbuf); diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index cb84bc1998..112b5cbb83 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -71,7 +71,7 @@ #define EG_MAX_ATOMIC_BUFFERS 8 -#define R600_MAX_USER_CONST_BUFFERS 14 +#define R600_MAX_USER_CONST_BUFFERS 15 #define R600_MAX_DRIVER_CONST_BUFFERS 3 #define R600_MAX_CONST_BUFFERS (R600_MAX_USER_CONST_BUFFERS + R600_MAX_DRIVER_CONST_BUFFERS) #define R600_MAX_HW_CONST_BUFFERS 16 @@ -80,12 +80,17 @@ #define 
R600_BUFFER_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS) #define R600_UCP_SIZE (4*4*8) #define R600_CS_BLOCK_GRID_SIZE (8 * 4) +#define R600_TCS_DEFAULT_LEVELS_SIZE (6 * 4) #define R600_BUFFER_INFO_OFFSET (R600_UCP_SIZE) +/* + * We only access this buffer through vtx clauses hence it's fine to exist + * at index beyond 15. + */ #define R600_LDS_INFO_CONST_BUFFER (R600_MAX_USER_CONST_BUFFERS + 1) /* * Note GS doesn't use a constant buffer b
[Mesa-dev] [PATCH 6/6] r600: don't emit tes samplers/views when tes isn't active
From: Roland Scheidegger Similar to const buffers. The driver must not emit any tes-related state if tes is disabled, since the hw slots are all shared by VS, therefore it would overwrite them (the mesa state tracker might not do this, but it would be perfectly legal to do so). Nevertheless I think the dirty state tracking logic in the driver is fundamentally flawed when tes is disabled/enabled, since it looks to me like the VS (and TES) state would not get reemitted to the correct slots (if it's not dirty anyway). Unless I'm missing something... Theoretically, the overwrite problem could be solved by using non-overlapping resource slots for TES and VS (since we're not even close to using half the resource slots), but it wouldn't work for constant buffers nor samplers, and for VS would still need to propagate changes to both LS and VS, so probably not a useful idea. Unfortunately there's zero coverage of this with piglit, since all tessellation shader tests are just shader_runner tests, which are unsuitable for testing any kind of state dependency tracking issues (so I can't even quickly hack something up to prove it and fix it...). TCS otoh is just fine - like GS it has its own hw slots. 
--- src/gallium/drivers/r600/evergreen_state.c | 4 src/gallium/drivers/r600/r600_state_common.c | 15 +++ 2 files changed, 19 insertions(+) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 4cc48dfa11..fb1de9cbf4 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2334,6 +2334,8 @@ static void evergreen_emit_tcs_sampler_views(struct r600_context *rctx, struct r static void evergreen_emit_tes_sampler_views(struct r600_context *rctx, struct r600_atom *atom) { + if (!rctx->tes_shader) + return; evergreen_emit_sampler_views(rctx, &rctx->samplers[PIPE_SHADER_TESS_EVAL].views, EG_FETCH_CONSTANTS_OFFSET_VS + R600_MAX_CONST_BUFFERS, 0); } @@ -2404,6 +2406,8 @@ static void evergreen_emit_tcs_sampler_states(struct r600_context *rctx, struct static void evergreen_emit_tes_sampler_states(struct r600_context *rctx, struct r600_atom *atom) { + if (!rctx->tes_shader) + return; evergreen_emit_sampler_states(rctx, &rctx->samplers[PIPE_SHADER_TESS_EVAL], 18, R_00A414_TD_VS_SAMPLER0_BORDER_INDEX, 0); } diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 4364350487..a434156c16 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -1723,6 +1723,21 @@ static bool r600_update_derived_state(struct r600_context *rctx) UPDATE_SHADER_CLIP(R600_HW_STAGE_VS, vs); } } + + /* +* XXX: I believe there's some fatal flaw in the dirty state logic when +* enabling/disabling tes. +* VS/ES share all buffer/resource/sampler slots. If TES is enabled, +* it will therefore overwrite the VS slots. 
If it now gets disabled, +* the VS needs to rebind all buffer/resource/sampler slots - not only +* has TES overwritten the corresponding slots, but when the VS was +* operating as LS the things with correpsonding dirty bits got bound +* to LS slots and won't reflect what is dirty as VS stage even if the +* TES didn't overwrite it. The story for re-enabled TES is similar. +* In any case, we're not allowed to submit any TES state when +* TES is disabled (the state tracker may not do this but this looks +* like an optimization to me, not something which can be relied on). +*/ /* Update clip misc state. */ if (clip_so_current) { -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/3] r600: set up constants needed for txq for buffers and cube maps with tes
From: Roland Scheidegger We only did this for the other stages, but obviously tess eval/ctrl need it too. This fixes the (newly modified) piglit texturing/textureSize test when run with tes stage and bufferSampler. --- src/gallium/drivers/r600/r600_state_common.c | 16 1 file changed, 16 insertions(+) diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index e7fa1bbf57..e9dd80fa96 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -1812,6 +1812,22 @@ static bool r600_update_derived_state(struct r600_context *rctx) } } + if (rctx->tes_shader) { + assert(rctx->b.chip_class >= EVERGREEN); + need_buf_const = rctx->tes_shader->current->shader.uses_tex_buffers || + rctx->tes_shader->current->shader.has_txq_cube_array_z_comp; + if (need_buf_const) { + eg_setup_buffer_constants(rctx, PIPE_SHADER_TESS_EVAL); + } + if (rctx->tcs_shader) { + need_buf_const = rctx->tcs_shader->current->shader.uses_tex_buffers || + rctx->tcs_shader->current->shader.has_txq_cube_array_z_comp; + if (need_buf_const) { + eg_setup_buffer_constants(rctx, PIPE_SHADER_TESS_CTRL); + } + } + } + r600_update_driver_const_buffers(rctx, false); if (rctx->b.chip_class < EVERGREEN && rctx->ps_shader && rctx->vs_shader) { -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/3] r600: support 32 vertex attribs for evergreen
From: Roland Scheidegger Evergreen clearly has 32 slots, so it should just work (and the affected array is already sized with PIPE_MAX_ATTRIB). Note: As dx10.1 chips, r600/r700 should support this too, but seemingly there's only 16 resource slots for fetch shaders (fs). However, a quick look seems to suggest the fs slots are actually shared with vs and not separate (as the fetch shader uses an offset of 160 on these chips), therefore (we're not even close to using all vs slots) just using different offsets might work, but I cannot verify this. No piglit change. --- src/gallium/drivers/r600/r600_pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 2583c719a3..c294973e8b 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -559,7 +559,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: return 32; case PIPE_SHADER_CAP_MAX_INPUTS: - return shader == PIPE_SHADER_VERTEX ? 16 : 32; + return shader == PIPE_SHADER_VERTEX ? (rscreen->b.family >= CHIP_CEDAR ? 32 : 16) : 32; case PIPE_SHADER_CAP_MAX_OUTPUTS: return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; case PIPE_SHADER_CAP_MAX_TEMPS: -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/3] r600: don't emit reloc for ring buffer out into the blue
From: Roland Scheidegger It looks like this reloc belongs to setting the constant reg, which is skipped for gs ring. --- src/gallium/drivers/r600/evergreen_state.c | 7 +++ src/gallium/drivers/r600/r600_state.c | 7 +++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 0da665f634..81b7c4a285 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2172,12 +2172,11 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, DIV_ROUND_UP(cb->buffer_size, 256), pkt_flags); radeon_set_context_reg_flag(cs, reg_alu_const_cache + buffer_index * 4, va >> 8, pkt_flags); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); } - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); - radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags); radeon_emit(cs, (buffer_id_base + buffer_index) * 8); radeon_emit(cs, va); /* RESOURCEi_WORD0 */ diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index cbf860f45f..253ff57a98 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -1715,12 +1715,11 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, radeon_set_context_reg(cs, reg_alu_constbuf_size + buffer_index * 4, DIV_ROUND_UP(cb->buffer_size, 256)); radeon_set_context_reg(cs, reg_alu_const_cache + buffer_index * 4, offset >> 8); + radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, + RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); } - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, 
radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, - RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); - radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0)); radeon_emit(cs, (buffer_id_base + buffer_index) * 7); radeon_emit(cs, offset); /* RESOURCEi_WORD0 */ -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] r600: kill off native_integer shader ctx flag
From: Roland Scheidegger Maybe once upon a time it wasn't always true. --- src/gallium/drivers/r600/r600_shader.c | 18 -- 1 file changed, 18 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 06d7ca02e9..6cdbfd3063 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -350,7 +350,6 @@ struct r600_shader_ctx { int cs_grid_size_reg; bool cs_block_size_loaded, cs_grid_size_loaded; int fragcoord_input; - int native_integers; int next_ring_offset; int gs_out_ring_offset; int gs_next_vertex; @@ -998,22 +997,6 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) { break; /* Already handled from allocate_system_value_inputs */ } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) { - if (!ctx->native_integers) { - struct r600_bytecode_alu alu; - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - - alu.op = ALU_OP1_INT_TO_FLT; - alu.src[0].sel = 0; - alu.src[0].chan = 3; - - alu.dst.sel = 0; - alu.dst.chan = 3; - alu.dst.write = 1; - alu.last = 1; - - if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) - return r; - } break; } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID) break; @@ -3128,7 +3111,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.bc = &shader->bc; ctx.shader = shader; - ctx.native_integers = true; r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, rscreen->has_compressed_msaa_texturing); -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] r600: fix textureSize queries with tbos
From: Roland Scheidegger piglit doesn't care, but I'm quite confident that the size actually bound as range should be reported and not the base size of the resource. Also, the array in the constant buffer looks overallocated by a factor of 4. For eg, also decrease the size by another factor of 2 by using the same constant slot for both buffer size (required for txq for TBOs) and the number of layers for cube arrays, as these are mutually exclusive. Could of course use some more logic and only actually do this for the samplers/images/buffers where it's required rather than for all, but ah well... (FWIW I believe the txq for TBOs would be fixable on EG without using a constant buffer by using the GET_BUFFER_RESINFO vc fetch, but for cube map arrays we'd still need the buffer as it's unfixable since the hw requires always 0 unfortunately.) --- src/gallium/drivers/r600/r600_shader.c | 18 +++--- src/gallium/drivers/r600/r600_state_common.c | 35 +--- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 6cdbfd3063..8a63621c2f 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -6955,9 +6955,9 @@ static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offs alu.op = ALU_OP1_MOV; alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; if (ctx->bc->chip_class >= EVERGREEN) { - /* channel 0 or 2 of each word */ - alu.src[0].sel += (id / 2); - alu.src[0].chan = (id % 2) * 2; + /* with eg each dword is either buf size or number of cubes */ + alu.src[0].sel += id / 4; + alu.src[0].chan = id % 4; } else { /* r600 we have them at channel 2 of the second dword */ alu.src[0].sel += (id * 2) + 1; @@ -7615,9 +7615,9 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; if (ctx->bc->chip_class >= EVERGREEN) { - /* channel 1 or 3 of each word */ - alu.src[0].sel += (id / 2); - alu.src[0].chan 
= ((id % 2) * 2) + 1; + /* with eg each dword is either buf size or number of cubes */ + alu.src[0].sel += id / 4; + alu.src[0].chan = id % 4; } else { /* r600 we have them at channel 2 of the second dword */ alu.src[0].sel += (id * 2) + 1; @@ -8782,9 +8782,9 @@ static int tgsi_resq(struct r600_shader_ctx *ctx) alu.op = ALU_OP1_MOV; alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; - /* channel 1 or 3 of each word */ - alu.src[0].sel += (id / 2); - alu.src[0].chan = ((id % 2) * 2) + 1; + /* with eg each dword is either buf size or number of cubes */ + alu.src[0].sel += id / 4; + alu.src[0].chan = id % 4; alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); alu.last = 1; diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index e5a5a33367..e9996cb3fa 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -902,7 +902,6 @@ struct r600_pipe_shader_selector *r600_create_shader_state_tokens(struct pipe_co unsigned pipe_shader_type) { struct r600_pipe_shader_selector *sel = CALLOC_STRUCT(r600_pipe_shader_selector); - int i; sel->type = pipe_shader_type; sel->tokens = tgsi_dup_tokens(tokens); @@ -1326,7 +1325,7 @@ static void r600_setup_buffer_constants(struct r600_context *rctx, int shader_ty samplers->views.dirty_buffer_constants = FALSE; bits = util_last_bit(samplers->views.enabled_mask); - array_size = bits * 8 * sizeof(uint32_t) * 4; + array_size = bits * 8 * sizeof(uint32_t); constants = r600_alloc_buf_consts(rctx, shader_type, array_size, &base_offset); @@ -1349,7 +1348,8 @@ static void r600_setup_buffer_constants(struct r600_context *rctx, int shader_ty } else constants[offset + 4] = 0; - constants[offset + 5] = samplers->views.views[i]->base.texture->width0 / util_format_get_blocksize(samplers->views.views[i]->base.format); + constants[offset + 5] = samplers->views.views[i]->base.u.buf.size / + 
util_format_get_blocksize(samplers->views.views[i]->base.format); constants[offset + 6] = samplers->views.views[
[Mesa-dev] [PATCH 1/2] gallivm: implement accurate corner behavior for textureGather with cube maps
From: Roland Scheidegger The spec says the missing texel (when we wrap around both x and y axis) should be synthesized as the average of the 3 other texels. For bilinear filtering however we instead adjusted the filter weights (because, while the complexity looks similar, there would be 4 times as many color values to fix up than weights). Obviously this could not work for gather (hence accurate corner filtering was disabled with gather). Implement this by just doing it as the spec implies - calculate the 4th texel as the average of the other 3. With gather of course there's only one color to worry about, so it's not all that many instructions neither (albeit surely the whole cube map filtering is hilariously complex). --- src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 304 ++ 1 file changed, 201 insertions(+), 103 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index def731e..571a968 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -1030,20 +1030,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef neighbors[2][2][4]; int chan, texel_index; boolean seamless_cube_filter, accurate_cube_corners; + unsigned chan_swiz = bld->static_texture_state->swizzle_r; seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE || bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) && bld->static_sampler_state->seamless_cube_map; - /* -* XXX I don't know how this is really supposed to work with gather. From GL -* spec wording (not gather specific) it sounds like the 4th missing texel -* should be an average of the other 3, hence for gather could return this. -* This is however NOT how the code here works, which just fixes up the -* weights used for filtering instead. And of course for gather there is -* no filter to tweak... 
-*/ - accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter && - !is_gather; + + accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter; lp_build_extract_image_sizes(bld, &bld->int_size_bld, @@ -1371,94 +1364,191 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, * as well) here. */ if (accurate_cube_corners) { - LLVMValueRef w00, w01, w10, w11, wx0, wy0; - LLVMValueRef c_weight, c00, c01, c10, c11; - LLVMValueRef have_corner, one_third, tmp; + LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f; + LLVMValueRef have_corner, one_third; - colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs"); - colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs"); - colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs"); - colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs"); + colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0"); + colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1"); + colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2"); + colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3"); have_corner = LLVMBuildLoad(builder, have_corners, ""); lp_build_if(&corner_if, bld->gallivm, have_corner); - /* - * we can't use standard 2d lerp as we need per-element weight - * in case of corners, so just calculate bilinear result as - * w00*s00 + w01*s01 + w10*s10 + w11*s11. - * (This is actually less work than using 2d lerp, 7 vs. 9 instructions, - * however calculating the weights needs another 6, so actually probably - * not slower than 2d lerp only for 4 channels as weights only need - * to be calculated once - of course fixing the weights has additional cost.) 
- */ - wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart); - wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart); - w00 = lp_build_mul(coord_bld, wx0, wy0); - w01 = lp_build_mul(coord_bld, s_fpart, wy0); - w10 = lp_build_mul(coord_bld, wx0, t_fpart); - w11 = lp_build_mul(coord_bld, s_fpart, t_fpart); - - /* find corner weight */ + one_third = lp_build_const_vec(bld->gallivm, coord_bld->type, +1.0f/3.0f); + + /* find corner */ c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]); - c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero); + c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, ""); c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]); -
[Mesa-dev] [PATCH 2/2] gallivm: fix an issue with NaNs with seamless cube filtering
From: Roland Scheidegger Cube texture wrapping is a bit special since the values (post face projection) always are within [0,1], so we took advantage of that and omitted some clamps. However, we can still get NaNs (either because the coords already had NaNs, or the face projection generated them), and in fact we didn't handle them quite safely. I've seen -INT_MAX + 1 being propagated through as the final int coord value, albeit I didn't observe a crash. (Not quite a coincidence, since any stride mul with -INT_MAX or -INT_MAX+1 will turn up as a small positive number - nevertheless, I'd rather not try my luck, I'm not entirely sure it can't really turn up negative neither due to seamless coord swapping, plus ifloor of a NaN is not guaranteed to return -INT_MAX by any standard. And we kill off NaNs similarly with ordinary texture wrapping too.) So kill off the NaNs by using the common max against zero method. --- src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 571a968..ff8cbf6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -1123,6 +1123,17 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, */ /* should always have normalized coords, and offsets are undefined */ assert(bld->static_sampler_state->normalized_coords); + /* + * The coords should all be between [0,1] however we can have NaNs, + * which will wreak havoc. In particular the y1_clamped value below + * can be -INT_MAX (on x86) and be propagated right through (probably + * other values might be bogus in the end too). + * So kill off the NaNs here. 
+ */ + coords[0] = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero, + GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); + coords[1] = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero, + GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); coord = lp_build_mul(coord_bld, coords[0], flt_width_vec); /* instead of clamp, build mask if overflowed */ coord = lp_build_sub(coord_bld, coord, half); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallivm: fix texture wrapping for texture gather for mirror modes
From: Roland Scheidegger Care must be taken that all coords end up correct, the tests are very sensitive that everything is correctly rounded. This doesn't matter for bilinear filter (since picking a wrong texel with weight zero is ok), and we could also switch the per-sample coords mistakenly. While here, also optimize the coord_mirror helper a bit (we can do the mirroring directly by exploiting float rounding, no need for fixing up odd/even manually). I did not touch the mirror_clamp and mirror_clamp_to_border modes. In contrast to mirror_clamp_to_edge and mirror_repeat these are legacy modes. They are specified against old gl rules, which actually does the mirroring not per sample (so you get swapped order if the coord is in the mirrored section). I think the idea though is that they should follow the respecified mirror_clamp_to_edge rules so the order would be correct. --- src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 242 +++--- 1 file changed, 169 insertions(+), 73 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index b67a089..3605c77 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -218,34 +218,42 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, /** - * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes. + * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode. + * (Note that with pot sizes could do this much more easily post-scale + * with some bit arithmetic.) 
*/ static LLVMValueRef lp_build_coord_mirror(struct lp_build_sample_context *bld, - LLVMValueRef coord) + LLVMValueRef coord, boolean posOnly) { struct lp_build_context *coord_bld = &bld->coord_bld; - struct lp_build_context *int_coord_bld = &bld->int_coord_bld; - LLVMValueRef fract, flr, isOdd; - - lp_build_ifloor_fract(coord_bld, coord, &flr, &fract); - /* kill off NaNs */ - /* XXX: not safe without arch rounding, fract can be anything. */ - fract = lp_build_max_ext(coord_bld, fract, coord_bld->zero, -GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); - - /* isOdd = flr & 1 */ - isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, ""); + LLVMValueRef fract; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); - /* make coord positive or negative depending on isOdd */ - /* XXX slight overkill masking out sign bit is unnecessary */ - coord = lp_build_set_sign(coord_bld, fract, isOdd); + /* +* We can just use 2*(x - round(0.5*x)) to do all the mirroring, +* it all works out. (The result is in range [-1, 1.0], negative if +* the coord is in the "odd" section, otherwise positive.) +*/ - /* convert isOdd to float */ - isOdd = lp_build_int_to_float(coord_bld, isOdd); + coord = lp_build_mul(coord_bld, coord, half); + fract = lp_build_round(coord_bld, coord); + fract = lp_build_sub(coord_bld, coord, fract); + coord = lp_build_add(coord_bld, fract, fract); - /* add isOdd to coord */ - coord = lp_build_add(coord_bld, coord, isOdd); + if (posOnly) { + /* + * Theoretically it's not quite 100% accurate because the spec says + * that ultimately a scaled coord of -x.0 should map to int coord + * -x + 1 with mirroring, not -x (this does not matter for bilinear + * filtering). + */ + coord = lp_build_abs(coord_bld, coord); + /* kill off NaNs */ + /* XXX: not safe without arch rounding, fract can be anything. 
*/ + coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero, + GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); + } return coord; } @@ -363,6 +371,11 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, } /* clamp to [0, length] */ + /* + * Unlike some other wrap modes, this should be correct for gather + * too. GL_CLAMP explicitly does this clamp on the coord prior to + * actual wrapping (which is per sample). + */ coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f); coord = lp_build_sub(coord_bld, coord, half); @@ -426,8 +439,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, offset = lp_build_int_to_float(coord_bld, offset); coord = lp_build_add(coord_bld, coord, offset); } - /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */ - /* can skip clamp (though might not work for very large coord values) */ + /* + * We don't need any clamp. Technically, for very large (pos or neg) + * (or infinite) values, clamp against [-length, length] would be + * correct, but we don't need to guarantee any specific + * result for such coords (the ifloor will be undefined, but for modes +
[Mesa-dev] [PATCH] r600: set DX10_CLAMP for compute shader too
From: Roland Scheidegger I really intended to set this for all shader stages by 3835009796166968750ff46cf209f6d4208cda86 but missed it for compute shaders (because it's in a different source file...). --- src/gallium/drivers/r600/evergreen_compute.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 6e87539..48c4a9c 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -746,8 +746,9 @@ void evergreen_emit_cs_shader(struct r600_context *rctx, radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3); radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */ radeon_emit(cs, /* R_0288D4_SQ_PGM_RESOURCES_LS */ - S_0288D4_NUM_GPRS(ngpr) - | S_0288D4_STACK_SIZE(nstack)); + S_0288D4_NUM_GPRS(ngpr) | + S_0288D4_DX10_CLAMP(1) | + S_0288D4_STACK_SIZE(nstack)); radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] llvmpipe: fix snorm blending
From: Roland Scheidegger The blend math gets a bit funky due to inverse blend factors being in range [0,2] rather than [-1,1], our normalized math can't really cover this. src_alpha_saturate blend factor has a similar problem too. (Note that piglit fbo-blending-formats test is mostly useless for anything but unorm formats, since not just all src/dst values are between [0,1], but the tests are crafted in a way that the results are between [0,1] too.) v2: some formatting fixes, and fix a fairly obscure (to debug) issue with alpha-only formats (not related to snorm at all), where blend optimization would think it could simplify the blend equation if the blend factors were complementary, however was using the completely unrelated rgb blend factors instead of the alpha ones... --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 50 - src/gallium/auxiliary/gallivm/lp_bld_arit.h | 7 ++ src/gallium/drivers/llvmpipe/lp_bld_blend.c | 130 ++-- src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c | 53 ++ 4 files changed, 187 insertions(+), 53 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index a1edd34..321c6e4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -541,38 +541,38 @@ lp_build_add(struct lp_build_context *bld, assert(lp_check_value(type, a)); assert(lp_check_value(type, b)); - if(a == bld->zero) + if (a == bld->zero) return b; - if(b == bld->zero) + if (b == bld->zero) return a; - if(a == bld->undef || b == bld->undef) + if (a == bld->undef || b == bld->undef) return bld->undef; - if(bld->type.norm) { + if (type.norm) { const char *intrinsic = NULL; - if(a == bld->one || b == bld->one) + if (!type.sign && (a == bld->one || b == bld->one)) return bld->one; if (!type.floating && !type.fixed) { if (type.width * type.length == 128) { -if(util_cpu_caps.has_sse2) { - if(type.width == 8) +if (util_cpu_caps.has_sse2) { + if (type.width == 8) intrinsic = 
type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; } else if (util_cpu_caps.has_altivec) { - if(type.width == 8) + if (type.width == 8) intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; } } if (type.width * type.length == 256) { -if(util_cpu_caps.has_avx2) { - if(type.width == 8) +if (util_cpu_caps.has_avx2) { + if (type.width == 8) intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; } } @@ -842,38 +842,38 @@ lp_build_sub(struct lp_build_context *bld, assert(lp_check_value(type, a)); assert(lp_check_value(type, b)); - if(b == bld->zero) + if (b == bld->zero) return a; - if(a == bld->undef || b == bld->undef) + if (a == bld->undef || b == bld->undef) return bld->undef; - if(a == b) + if (a == b) return bld->zero; - if(bld->type.norm) { + if (type.norm) { const char *intrinsic = NULL; - if(b == bld->one) + if (!type.sign && b == bld->one) return bld->zero; if (!type.floating && !type.fixed) { if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { - if(type.width == 8) + if (type.width == 8) intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; } else if (util_cpu_caps.has_altivec) { - if(type.width == 8) + if (type.width == 8) intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; - if(type.width == 16) + if (type.width == 16) intrinsic = type.sign ? 
"llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; } } if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { - if(type.width == 8) + if (type.width == 8)
[Mesa-dev] [PATCH] llvmpipe: fix snorm blending
From: Roland Scheidegger The blend math gets a bit funky due to inverse blend factors being in range [0,2] rather than [-1,1], our normalized math can't really cover this. src_alpha_saturate blend factor has a similar problem too. (Note that piglit fbo-blending-formats test is mostly useless for anything but unorm formats, since not just all src/dst values are between [0,1], but the tests are crafted in a way that the results are between [0,1] too.) --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 10 +- src/gallium/auxiliary/gallivm/lp_bld_arit.h | 7 ++ src/gallium/drivers/llvmpipe/lp_bld_blend.c | 120 +++- src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c | 28 -- 4 files changed, 149 insertions(+), 16 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index a1edd34..628dedd 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -548,10 +548,10 @@ lp_build_add(struct lp_build_context *bld, if(a == bld->undef || b == bld->undef) return bld->undef; - if(bld->type.norm) { + if(type.norm) { const char *intrinsic = NULL; - if(a == bld->one || b == bld->one) + if(!type.sign && (a == bld->one || b == bld->one)) return bld->one; if (!type.floating && !type.fixed) { @@ -849,10 +849,10 @@ lp_build_sub(struct lp_build_context *bld, if(a == b) return bld->zero; - if(bld->type.norm) { + if(type.norm) { const char *intrinsic = NULL; - if(b == bld->one) + if(!type.sign && b == bld->one) return bld->zero; if (!type.floating && !type.fixed) { @@ -963,7 +963,7 @@ lp_build_sub(struct lp_build_context *bld, * @sa Michael Herf, The "double blend trick", May 2000, * http://www.stereopsis.com/doubleblend.html */ -static LLVMValueRef +LLVMValueRef lp_build_mul_norm(struct gallivm_state *gallivm, struct lp_type wide_type, LLVMValueRef a, LLVMValueRef b) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 
2a4137a..f5b2800 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -71,6 +71,13 @@ lp_build_sub(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_mul_norm(struct gallivm_state *gallivm, + struct lp_type wide_type, + LLVMValueRef a, + LLVMValueRef b); + LLVMValueRef lp_build_mul(struct lp_build_context *bld, LLVMValueRef a, diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/src/gallium/drivers/llvmpipe/lp_bld_blend.c index 1feb415..bd886dc 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.c @@ -35,6 +35,7 @@ #include "gallivm/lp_bld_swizzle.h" #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_pack.h" #include "lp_bld_blend.h" @@ -86,6 +87,56 @@ lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor) /** + * Whether this is a inverse blend factor + */ +static inline boolean +is_inverse_factor(unsigned factor) +{ + return factor > 0x11; +} + + +/** + * Calculates the (expanded to wider type) multiplication + * of 2 normalized numbers. 
+ */ +static void +lp_build_mul_norm_expand(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b, + LLVMValueRef *resl, LLVMValueRef *resh, + boolean signedness_differs) +{ + const struct lp_type type = bld->type; + struct lp_type wide_type = lp_wider_type(type); + struct lp_type wide_type2 = wide_type; + struct lp_type type2 = type; + LLVMValueRef al, ah, bl, bh; + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + assert(!type.floating && !type.fixed && type.norm); + + if(a == bld->zero || b == bld->zero) { + LLVMValueRef zero = LLVMConstNull(lp_build_vec_type(bld->gallivm, wide_type)); + *resl = zero; + *resh = zero; + return; + } + + if (signedness_differs) { + type2.sign = !type.sign; + wide_type2.sign = !wide_type2.sign; + } + + lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); + lp_build_unpack2_native(bld->gallivm, type2, wide_type2, b, &bl, &bh); + + *resl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); + *resh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); +} + + +/** * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml */ LLVMValueRef @@ -192,9 +243,72 @@ lp_build_blend(struct lp_build_context *bld, if (optimise_only) return NULL; - src_term = lp_build_mul(bld, src, src_factor); - dst_term = lp_build_mul(bld, dst, dst_factor); - return lp_bui
[Mesa-dev] [PATCH 3/5] r600: use ieee version of rcp
From: Roland Scheidegger r600 used the clamped version for rcp, whereas both evergreen and cayman used the ieee version. I don't know why that discrepancy exists (it does so since day 1) but there does not seem to be a valid reason for this, so make it consistent. This seems now safer than before the previous commit (using the dx10 clamp bit). Note that rsq still uses clamped version (as before even though the table may have suggested otherwise for evergreen) for r600/eg, but not for cayman. Will be changed separately for better regression tracking... --- src/gallium/drivers/r600/r600_shader.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index e9054c4fbb..2ece2210a6 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -8830,11 +8830,7 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, - /* XXX: -* For state trackers other than OpenGL, we'll want to use -* _RECIP_IEEE instead. 
-*/ - [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate}, + [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate}, [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq}, [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, @@ -9035,7 +9031,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate}, - [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq}, + [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq}, [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/5] r600: use ieee version of rsq
From: Roland Scheidegger Both r600 and evergreen used the clamped version, whereas cayman used the ieee one. I don't think there's a valid reason for this discrepancy, so let's switch to the ieee version for r600 and evergreen too, since we generally want to stick to ieee arithmetic. With this, behavior for both rcp and rsq should now be the same for all of r600, eg, cm, all using ieee versions (albeit note rsq retains the abs behavior for everybody, which may not be a good idea ultimately). --- src/gallium/drivers/r600/r600_shader.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 2ece2210a6..3f42654d13 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -4796,11 +4796,7 @@ static int tgsi_rsq(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - /* XXX: -* For state trackers other than OpenGL, we'll want to use -* _RECIPSQRT_IEEE instead. -*/ - alu.op = ALU_OP1_RECIPSQRT_CLAMPED; + alu.op = ALU_OP1_RECIPSQRT_IEEE; for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { r600_bytecode_src(&alu.src[i], &ctx->src[i], 0); -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/5] r600: use min_dx10/max_dx10 instead of min/max
From: Roland Scheidegger I believe this is the safe thing to do, especially ever since the driver actually generates NaNs for muls too. The ISA docs are not very helpful here, however the dx10 versions will pick a non-nan result over a NaN one (this is also the ieee754 behavior), whereas the non-dx10 ones will pick the NaN (verified by newly changed piglit isinf-and-isnan test). Other "modern" drivers will most likely do the same. This was shown to make some difference for bug 103544, albeit it is not required to fix it. --- src/gallium/drivers/r600/r600_shader.c | 13 +++-- src/gallium/drivers/r600/sb/sb_expr.cpp | 2 ++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 188fbc9d47..e9054c4fbb 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -8844,8 +8844,9 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + /* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */ + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, @@ -9042,8 +9043,8 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + 
[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, @@ -9265,8 +9266,8 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 3dd3a4815b..7a5d62c8e8 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -753,7 +753,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) { n.bc.src[0].abs == n.bc.src[1].abs) { switch (n.bc.op) { case ALU_OP2_MIN: // (MIN x, x) => (MOV x) + case ALU_OP2_MIN_DX10: case ALU_OP2_MAX: + case ALU_OP2_MAX_DX10: convert_to_mov(n, v0, n.bc.src[0].neg, n.bc.src[0].abs); return fold_alu_op1(n); case ALU_OP2_ADD: // (ADD x, x) => (MUL x, 2) -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 5/5] r600: set the number type correctly for float rts in cb setup
From: Roland Scheidegger Float rts were always set as unorm instead of float. Not sure of the consequences, but at least it looks like the blend clamp would have been enabled, which is against the rules (only eg really bothered to even attempt to specify this correctly, r600 always used clamp anyway). Albeit r600 (not r700) setup still looks bugged to me due to never setting BLEND_FLOAT32 which must be set according to docs... Not sure if the hw really cares, no piglit change (on eg/juniper). --- src/gallium/drivers/r600/evergreen_state.c | 7 ++- src/gallium/drivers/r600/r600_state.c | 10 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index ef323bf4f6..e724cb157f 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -1042,7 +1042,7 @@ static void evergreen_set_color_surface_buffer(struct r600_context *rctx, } } ntype = V_028C70_NUMBER_UNORM; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) ntype = V_028C70_NUMBER_SRGB; else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { if (desc->channel[i].normalized) @@ -1054,7 +1054,10 @@ static void evergreen_set_color_surface_buffer(struct r600_context *rctx, ntype = V_028C70_NUMBER_UNORM; else if (desc->channel[i].pure_integer) ntype = V_028C70_NUMBER_UINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; } + pitch = (pitch / 8) - 1; color->pitch = S_028C64_PITCH_TILE_MAX(pitch); @@ -1180,6 +1183,8 @@ static void evergreen_set_color_surface_common(struct r600_context *rctx, ntype = V_028C70_NUMBER_UNORM; else if (desc->channel[i].pure_integer) ntype = V_028C70_NUMBER_UINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; } if (R600_BIG_ENDIAN) diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c index db3d6db70b..f024987a30 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -817,7 +817,7 @@ static void r600_init_color_surface(struct r600_context *rctx, unsigned offset; const struct util_format_description *desc; int i; - bool blend_bypass = 0, blend_clamp = 1, do_endian_swap = FALSE; + bool blend_bypass = 0, blend_clamp = 0, do_endian_swap = FALSE; if (rtex->db_compatible && !r600_can_sample_zs(rtex, false)) { r600_init_flushed_depth_texture(&rctx->b.b, surf->base.texture, NULL); @@ -869,6 +869,8 @@ static void r600_init_color_surface(struct r600_context *rctx, ntype = V_0280A0_NUMBER_UNORM; else if (desc->channel[i].pure_integer) ntype = V_0280A0_NUMBER_UINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_0280A0_NUMBER_FLOAT; } if (R600_BIG_ENDIAN) @@ -883,6 +885,11 @@ static void r600_init_color_surface(struct r600_context *rctx, endian = r600_colorformat_endian_swap(format, do_endian_swap); + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_0280A0_NUMBER_UNORM || ntype == V_0280A0_NUMBER_SNORM || + ntype == V_0280A0_NUMBER_SRGB) + blend_clamp = 1; + /* set blend bypass according to docs if SINT/UINT or 8/24 COLOR variants */ if (ntype == V_0280A0_NUMBER_UINT || ntype == V_0280A0_NUMBER_SINT || @@ -916,6 +923,7 @@ static void r600_init_color_surface(struct r600_context *rctx, ntype != V_0280A0_NUMBER_UINT && ntype != V_0280A0_NUMBER_SINT) && G_0280A0_BLEND_CLAMP(color_info) && + /* XXX this condition is always true since BLEND_FLOAT32 is never set (bug?). */ !G_0280A0_BLEND_FLOAT32(color_info)) { color_info |= S_0280A0_SOURCE_FORMAT(V_0280A0_EXPORT_NORM); surf->export_16bpc = true; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/5] r600: use DX10_CLAMP bit in shader setup
From: Roland Scheidegger The docs are not very concise in what this really does, however both Alex Deucher and Nicolai Hähnle suggested this only really affects instructions using the CLAMP output modifier, and I've confirmed that with the newly changed piglit isinf_and_isnan test. So, with this bit set, if an instruction has the CLAMP modifier bit (which clamps to [0,1]) set, then NaNs will be converted to zero, otherwise the result will be NaN. D3D10 would require this, glsl doesn't have modifiers (with mesa clamp(x,0,1) would get converted to such a modifier) coupled with a whatever-floats-your-boat specified NaN behavior, but the clamp behavior should probably always be used (this also matches what a decomposition into min(1.0, max(x, 0.0)) would do, if min/max also adhere to the ieee spec of picking the non-nan result). Some apps may in fact rely on this, as this prevents misrenderings in This War of Mine since using ieee muls (ce7a045feeef8cad155f1c9aa07f166e146e3d00), without having to use clamped rcp opcode, which would also fix this bug there. radeonsi also seems to set this bit nowadays if I see that right (albeit the llvm amdgpu code comment now says "Make clamp modifier on NaN input returns 0" instead of "Do not clamp NAN to 0" since it was changed, which also looks a bit misleading). v2: set it in all shader stages. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103544 --- src/gallium/drivers/r600/evergreen_state.c | 6 ++ src/gallium/drivers/r600/r600_state.c | 9 + 2 files changed, 15 insertions(+) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 96eb35a981..ef323bf4f6 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -3235,6 +3235,7 @@ void evergreen_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader r600_store_value(cb, /* R_028844_SQ_PGM_RESOURCES_PS */ S_028844_NUM_GPRS(rshader->bc.ngpr) | S_028844_PRIME_CACHE_ON_DRAW(1) | +S_028844_DX10_CLAMP(1) | S_028844_STACK_SIZE(rshader->bc.nstack)); /* After that, the NOP relocation packet must be emitted (shader->bo, RADEON_USAGE_READ). */ @@ -3255,6 +3256,7 @@ void evergreen_update_es_state(struct pipe_context *ctx, struct r600_pipe_shader r600_store_context_reg(cb, R_028890_SQ_PGM_RESOURCES_ES, S_028890_NUM_GPRS(rshader->bc.ngpr) | + S_028890_DX10_CLAMP(1) | S_028890_STACK_SIZE(rshader->bc.nstack)); r600_store_context_reg(cb, R_02888C_SQ_PGM_START_ES, shader->bo->gpu_address >> 8); @@ -3317,6 +3319,7 @@ void evergreen_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader r600_store_context_reg(cb, R_028878_SQ_PGM_RESOURCES_GS, S_028878_NUM_GPRS(rshader->bc.ngpr) | + S_028878_DX10_CLAMP(1) | S_028878_STACK_SIZE(rshader->bc.nstack)); r600_store_context_reg(cb, R_028874_SQ_PGM_START_GS, shader->bo->gpu_address >> 8); @@ -3357,6 +3360,7 @@ void evergreen_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader S_0286C4_VS_EXPORT_COUNT(nparams - 1)); r600_store_context_reg(cb, R_028860_SQ_PGM_RESOURCES_VS, S_028860_NUM_GPRS(rshader->bc.ngpr) | + S_028860_DX10_CLAMP(1) | S_028860_STACK_SIZE(rshader->bc.nstack)); if (rshader->vs_position_window_space) { r600_store_context_reg(cb, R_028818_PA_CL_VTE_CNTL, @@ -3391,6 +3395,7 @@ void evergreen_update_hs_state(struct pipe_context 
*ctx, struct r600_pipe_shader r600_init_command_buffer(cb, 32); r600_store_context_reg(cb, R_0288BC_SQ_PGM_RESOURCES_HS, S_0288BC_NUM_GPRS(rshader->bc.ngpr) | + S_0288BC_DX10_CLAMP(1) | S_0288BC_STACK_SIZE(rshader->bc.nstack)); r600_store_context_reg(cb, R_0288B8_SQ_PGM_START_HS, shader->bo->gpu_address >> 8); @@ -3404,6 +3409,7 @@ void evergreen_update_ls_state(struct pipe_context *ctx, struct r600_pipe_shader r600_init_command_buffer(cb, 32); r600_store_context_reg(cb, R_0288D4_SQ_PGM_RESOURCES_LS, S_0288D4_NUM_GPRS(rshader->bc.ngpr) | + S_0288D4_DX10_CLAMP(1) | S_0288D4_STACK_SIZE(rshader->bc.nstack)); r600_store_context_reg(cb, R_0288D0_SQ_PGM_START_LS, shader->bo->gpu_address >> 8); diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r
[Mesa-dev] [PATCH 1/4] r600: use min_dx10/max_dx10 instead of min/max
From: Roland Scheidegger I believe this is the safe thing to do, especially ever since the driver actually generates NaNs for muls too. Albeit since the radeon ISA docs are inaccurate/wrong there, I'm not entirely sure what the non-dx10 versions do, but (as required by dx10) the dx10 versions should pick a non-nan source over a nan source. Other drivers presumably do the same (radeonsi, llvmpipe). This was shown to make some difference for bug 103544, albeit it is not required to fix it. --- src/gallium/drivers/r600/r600_shader.c | 12 ++-- src/gallium/drivers/r600/sb/sb_expr.cpp | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 188fbc9d47..6a755bb3fd 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -8844,8 +8844,8 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, @@ -9042,8 +9042,8 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { 
ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, @@ -9265,8 +9265,8 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 3dd3a4815b..7a5d62c8e8 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -753,7 +753,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) { n.bc.src[0].abs == n.bc.src[1].abs) { switch (n.bc.op) { case ALU_OP2_MIN: // (MIN x, x) => (MOV x) + case ALU_OP2_MIN_DX10: case ALU_OP2_MAX: + case ALU_OP2_MAX_DX10: convert_to_mov(n, v0, n.bc.src[0].neg, n.bc.src[0].abs); return fold_alu_op1(n); case ALU_OP2_ADD: // (ADD x, x) => (MUL x, 2) -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 4/4] r600: set the number type correctly for float rts in cb setup
From: Roland Scheidegger Float rts were always set as unorm instead of float. Not sure of the consequences, but at least it looks like the blend clamp would have been enabled, which is against the rules (only eg really bothered to even attempt to specify this correctly, r600 always used clamp anyway). Albeit r600 (not r700) setup still looks bugged to me due to never setting BLEND_FLOAT32 which must be set according to docs... Not sure if the hw really cares, no piglit change. --- src/gallium/drivers/r600/evergreen_state.c | 7 ++- src/gallium/drivers/r600/r600_state.c | 10 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index ddd59dc0b5..ba08f38f8c 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -1042,7 +1042,7 @@ static void evergreen_set_color_surface_buffer(struct r600_context *rctx, } } ntype = V_028C70_NUMBER_UNORM; - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) ntype = V_028C70_NUMBER_SRGB; else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { if (desc->channel[i].normalized) @@ -1054,7 +1054,10 @@ static void evergreen_set_color_surface_buffer(struct r600_context *rctx, ntype = V_028C70_NUMBER_UNORM; else if (desc->channel[i].pure_integer) ntype = V_028C70_NUMBER_UINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; } + pitch = (pitch / 8) - 1; color->pitch = S_028C64_PITCH_TILE_MAX(pitch); @@ -1180,6 +1183,8 @@ static void evergreen_set_color_surface_common(struct r600_context *rctx, ntype = V_028C70_NUMBER_UNORM; else if (desc->channel[i].pure_integer) ntype = V_028C70_NUMBER_UINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; } if (R600_BIG_ENDIAN) diff --git a/src/gallium/drivers/r600/r600_state.c 
b/src/gallium/drivers/r600/r600_state.c index c0d0b1667a..0bda8d5b3f 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -817,7 +817,7 @@ static void r600_init_color_surface(struct r600_context *rctx, unsigned offset; const struct util_format_description *desc; int i; - bool blend_bypass = 0, blend_clamp = 1, do_endian_swap = FALSE; + bool blend_bypass = 0, blend_clamp = 0, do_endian_swap = FALSE; if (rtex->db_compatible && !r600_can_sample_zs(rtex, false)) { r600_init_flushed_depth_texture(&rctx->b.b, surf->base.texture, NULL); @@ -869,6 +869,8 @@ static void r600_init_color_surface(struct r600_context *rctx, ntype = V_0280A0_NUMBER_UNORM; else if (desc->channel[i].pure_integer) ntype = V_0280A0_NUMBER_UINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_0280A0_NUMBER_FLOAT; } if (R600_BIG_ENDIAN) @@ -883,6 +885,11 @@ static void r600_init_color_surface(struct r600_context *rctx, endian = r600_colorformat_endian_swap(format, do_endian_swap); + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_0280A0_NUMBER_UNORM || ntype == V_0280A0_NUMBER_SNORM || + ntype == V_0280A0_NUMBER_SRGB) + blend_clamp = 1; + /* set blend bypass according to docs if SINT/UINT or 8/24 COLOR variants */ if (ntype == V_0280A0_NUMBER_UINT || ntype == V_0280A0_NUMBER_SINT || @@ -916,6 +923,7 @@ static void r600_init_color_surface(struct r600_context *rctx, ntype != V_0280A0_NUMBER_UINT && ntype != V_0280A0_NUMBER_SINT) && G_0280A0_BLEND_CLAMP(color_info) && + /* XXX this condition is always true since BLEND_FLOAT32 is never set (bug?). */ !G_0280A0_BLEND_FLOAT32(color_info)) { color_info |= S_0280A0_SOURCE_FORMAT(V_0280A0_EXPORT_NORM); surf->export_16bpc = true; -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/4] r600: use mysterious DX10_CLAMP bit in pixel shader setup
From: Roland Scheidegger I don't know what this bit really does. The docs are somewhere between misleading and wrong however, as at least the newer ones (that bit exists with GCN as well) imply all NaNs would get converted to zeros, which is definitely NOT the case (and that would not be dx10 compliant either), the r600 ones are also talking about "dx10 style" vs "dx9 style" clamp, whatever that means for dx9... Makes no difference at all with piglit's isinf-and-isnan tests, so very obviously NaNs are still generated just fine. radeonsi also seems to set this bit nowadays (the llvm amdgpu code comment now says "Make clamp modifier on NaN input returns 0" instead of "Do not clamp NAN to 0" since it was changed). This prevents misrenderings in This War of Mine since using ieee muls (ce7a045feeef8cad155f1c9aa07f166e146e3d00), without having to use clamped rcp opcode, which would also fix this. AMD, it would be really really nice if there would be useful/correct/accurate information about this bit... The bit can be set for all shader stages, and maybe it should be set but I really have no idea... Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103544 --- src/gallium/drivers/r600/evergreen_state.c | 1 + src/gallium/drivers/r600/r600_state.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 96eb35a981..ddd59dc0b5 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -3235,6 +3235,7 @@ void evergreen_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader r600_store_value(cb, /* R_028844_SQ_PGM_RESOURCES_PS */ S_028844_NUM_GPRS(rshader->bc.ngpr) | S_028844_PRIME_CACHE_ON_DRAW(1) | +S_028844_DX10_CLAMP(1) | S_028844_STACK_SIZE(rshader->bc.nstack)); /* After that, the NOP relocation packet must be emitted (shader->bo, RADEON_USAGE_READ). 
*/ diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index c21e8dabb1..c0d0b1667a 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -2548,6 +2548,7 @@ void r600_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader *sha r600_store_context_reg_seq(cb, R_028850_SQ_PGM_RESOURCES_PS, 2); r600_store_value(cb, /* R_028850_SQ_PGM_RESOURCES_PS*/ S_028850_NUM_GPRS(rshader->bc.ngpr) | +S_028850_DX10_CLAMP(1) | S_028850_STACK_SIZE(rshader->bc.nstack) | S_028850_UNCACHED_FIRST_INST(ufi)); r600_store_value(cb, exports_ps); /* R_028854_SQ_PGM_EXPORTS_PS */ -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 3/4] r600: use ieee version of rcp
From: Roland Scheidegger r600 used the clamped version for rcp, whereas both evergreen and cayman used the ieee version. I don't know why that discrepancy exists (it does so since day 1) but there does not seem to be a valid reason for this, so make it consistent. This seems now safer than before the previous commit (using the mystery dx10 clamp). Note that rsq still uses clamped version (as before even though the table may have suggested otherwise for evergreen) for r600/eg, but not for cayman. I just don't feel lucky enough to change this (it should also be noted r600 supports sqrt natively, which is always ieee, therefore might not really see rsqrt with glsl often presumably). Compile tested only... --- src/gallium/drivers/r600/r600_shader.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 6a755bb3fd..628c33787e 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -8830,11 +8830,7 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, - /* XXX: -* For state trackers other than OpenGL, we'll want to use -* _RECIP_IEEE instead. 
-*/ - [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate}, + [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate}, [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq}, [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, @@ -9034,7 +9030,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate}, - [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq}, + [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq}, [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 2/2] r600: use the clamped versions of rcp/rsq for eg/cayman.
From: Roland Scheidegger r600 already used the clamped versions, but for some reason this was different to eg/cayman. (Note that it has been different since essentially forever, 7 years, since df62338c491f2cace1a48f99de78e83b5edd82fd in particular, which changed this for r600 but not eg (cayman wasn't supported back then, but probably copied this from the eg part later). The commit does not mention any reason why this difference should exist.) This seems a bit unfortunate, since it would be nice to use ieee arithmetic, I have no idea what this could potentially break and no idea if it really makes sense going back to legacy-style rcp/rsq... This however prevents misrenderings in This War of Mine since using ieee muls (ce7a045feeef8cad155f1c9aa07f166e146e3d00), albeit strictly speaking only rcp_clamped is necessary for this. It seems likely the root cause is some x * rcp(y) calculation where both x and y evaluate to 0. Albeit it apparently works with other drivers, not sure what's up with that... Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103544 --- src/gallium/drivers/r600/r600_shader.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 6a755bb3fd..62fc4da901 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -9033,8 +9033,12 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl}, [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, - [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate}, - [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq}, + /* XXX: +* For state trackers other than OpenGL, we'll want to use +* _RECIP_IEEE/_RECIPSQRT_IEEE instead. 
+*/ + [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate}, + [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_CLAMPED, tgsi_rsq}, [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, @@ -9256,8 +9260,12 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl}, [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, - [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr}, - [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr}, + /* XXX: +* For state trackers other than OpenGL, we'll want to use +* _RECIP_IEEE/_RECIPSQRT_IEEE instead. +*/ + [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_CLAMPED, cayman_emit_float_instr}, + [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_CLAMPED, cayman_emit_float_instr}, [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH 1/2] r600: use min_dx10/max_dx10 instead of min/max
From: Roland Scheidegger I believe this is the safe thing to do, especially ever since the driver actually generates NaNs for muls too. Albeit since the radeon ISA docs are inaccurate/wrong there, I'm not entirely sure what the non-dx10 versions do, but (as required by dx10) the dx10 versions should pick a non-nan source over a nan source. Other drivers presumably do the same (radeonsi, llvmpipe). This was shown to make some difference for bug 103544, albeit it is not required to fix it. --- src/gallium/drivers/r600/r600_shader.c | 12 ++-- src/gallium/drivers/r600/sb/sb_expr.cpp | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 188fbc9d47..6a755bb3fd 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -8844,8 +8844,8 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, @@ -9042,8 +9042,8 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { 
ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, @@ -9265,8 +9265,8 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, - [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2}, - [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2}, + [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, + [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 3dd3a4815b..7a5d62c8e8 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -753,7 +753,9 @@ bool expr_handler::fold_alu_op2(alu_node& n) { n.bc.src[0].abs == n.bc.src[1].abs) { switch (n.bc.op) { case ALU_OP2_MIN: // (MIN x, x) => (MOV x) + case ALU_OP2_MIN_DX10: case ALU_OP2_MAX: + case ALU_OP2_MAX_DX10: convert_to_mov(n, v0, n.bc.src[0].neg, n.bc.src[0].abs); return fold_alu_op1(n); case ALU_OP2_ADD: // (ADD x, x) => (MUL x, 2) -- 2.12.3 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] docs: Fix GL_MESA_program_debug enums
From: Roland Scheidegger 13b303ff9265b89bdd9100e32f905e9cdadfad81 added the actual enums but didn't remove the already existing ones. (And also duplicated the "fragment" names instead of using the "vertex" names.) --- docs/specs/enums.txt | 26 -- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/docs/specs/enums.txt b/docs/specs/enums.txt index 4b0485f..7b5709b 100644 --- a/docs/specs/enums.txt +++ b/docs/specs/enums.txt @@ -46,14 +46,14 @@ GL_MESA_shader_debug.spec: (obsolete) GL_DEBUG_ASSERT_MESA 0x875B GL_MESA_program_debug: (obsolete) - GL_FRAGMENT_PROGRAM_CALLBACK_MESA 0x - GL_VERTEX_PROGRAM_CALLBACK_MESA0x - GL_FRAGMENT_PROGRAM_POSITION_MESA 0x - GL_VERTEX_PROGRAM_POSITION_MESA0x - GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA 0x - GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA 0x - GL_VERTEX_PROGRAM_CALLBACK_FUNC_MESA 0x - GL_VERTEX_PROGRAM_CALLBACK_DATA_MESA 0x + GL_FRAGMENT_PROGRAM_POSITION_MESA 0x8BB0 + GL_FRAGMENT_PROGRAM_CALLBACK_MESA 0x8BB1 + GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA 0x8BB2 + GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA 0x8BB3 + GL_VERTEX_PROGRAM_POSITION_MESA 0x8BB4 + GL_VERTEX_PROGRAM_CALLBACK_MESA 0x8BB5 + GL_VERTEX_PROGRAM_CALLBACK_FUNC_MESA0x8BB6 + GL_VERTEX_PROGRAM_CALLBACK_DATA_MESA0x8BB7 GL_MESAX_texture_stack: GL_TEXTURE_1D_STACK_MESAX0x8759 @@ -63,16 +63,6 @@ GL_MESAX_texture_stack: GL_TEXTURE_1D_STACK_BINDING_MESAX0x875D GL_TEXTURE_2D_STACK_BINDING_MESAX0x875E -GL_MESA_program_debug - GL_FRAGMENT_PROGRAM_POSITION_MESA 0x8BB0 - GL_FRAGMENT_PROGRAM_CALLBACK_MESA 0x8BB1 - GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA 0x8BB2 - GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA 0x8BB3 - GL_FRAGMENT_PROGRAM_POSITION_MESA 0x8BB4 - GL_FRAGMENT_PROGRAM_CALLBACK_MESA 0x8BB5 - GL_FRAGMENT_PROGRAM_CALLBACK_FUNC_MESA 0x8BB6 - GL_FRAGMENT_PROGRAM_CALLBACK_DATA_MESA 0x8BB7 - GL_MESA_tile_raster_order GL_TILE_RASTER_ORDER_FIXED_MESA 0x8BB8 GL_TILE_RASTER_ORDER_INCREASING_X_MESA 0x8BB9 -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org 
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] draw: don't cull tris with zero area
From: Roland Scheidegger Culling tris with zero area seems like a great idea, but apparently with fill mode line (and point) we're supposed to draw them, at least some tests for some other state tracker complained otherwise. Such tris also always seem to be back facing (not sure if this can be inferred from anything, since in a mathematical sense it cannot really be determined), so make sure to account for this when filling in the face information. (For solid tris, this is of course unnecessary, drivers will throw the tris away later in any case.) --- src/gallium/auxiliary/draw/draw_pipe_cull.c | 10 ++ src/gallium/auxiliary/draw/draw_pipe_unfilled.c | 5 ++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c index 3e8e458..3863485 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_cull.c +++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c @@ -181,6 +181,16 @@ static void cull_tri( struct draw_stage *stage, /* triangle is not culled, pass to next stage */ stage->next->tri( stage->next, header ); } + } else { + /* + * With zero aera, this is back facing (because the spec says + * it's front facing if sign is positive?). + * Some apis apparently do not allow us to cull zero aera tris + * here, in case of fill mode line (which is rather lame). 
+ */ + if ((PIPE_FACE_BACK & cull_stage(stage)->cull_face) == 0) { +stage->next->tri( stage->next, header ); + } } } } diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c index c465c75..f39db0e 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c +++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c @@ -63,10 +63,9 @@ inject_front_face_info(struct draw_stage *stage, struct prim_header *header) { struct unfilled_stage *unfilled = unfilled_stage(stage); - unsigned ccw = header->det < 0.0; boolean is_front_face = ( - (stage->draw->rasterizer->front_ccw && ccw) || - (!stage->draw->rasterizer->front_ccw && !ccw)); + (stage->draw->rasterizer->front_ccw && header->det < 0.0f) || + (!stage->draw->rasterizer->front_ccw && header->det > 0.0f)); int slot = unfilled->face_slot; unsigned i; -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] gallium/util: remove some block alignment assertions
From: Roland Scheidegger These assertions were revisited a couple of times in the past, and they still weren't quite right. The problem I was seeing (with some other state tracker) was a copy between two 512x512 s3tc textures, but from mip level 0 to mip level 8. Therefore, the destination has only size 2x2 (not a full block), so the box width/height was only 2, causing the assertion to trigger for src alignment. As far as I can tell, such a copy is completely legal, and because a correct assertion would get ridiculously complicated just get rid of it for good. --- src/gallium/auxiliary/util/u_surface.c | 8 1 file changed, 8 deletions(-) diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c index 5abf966..0a79a25 100644 --- a/src/gallium/auxiliary/util/u_surface.c +++ b/src/gallium/auxiliary/util/u_surface.c @@ -324,16 +324,8 @@ util_resource_copy_region(struct pipe_context *pipe, /* check that region boxes are block aligned */ assert(src_box.x % src_bw == 0); assert(src_box.y % src_bh == 0); - assert(src_box.width % src_bw == 0 || - src_box.x + src_box.width == u_minify(src->width0, src_level)); - assert(src_box.height % src_bh == 0 || - src_box.y + src_box.height == u_minify(src->height0, src_level)); assert(dst_box.x % dst_bw == 0); assert(dst_box.y % dst_bh == 0); - assert(dst_box.width % dst_bw == 0 || - dst_box.x + dst_box.width == u_minify(dst->width0, dst_level)); - assert(dst_box.height % dst_bh == 0 || - dst_box.y + dst_box.height == u_minify(dst->height0, dst_level)); /* check that region boxes are not out of bounds */ assert(src_box.x + src_box.width <= u_minify(src->width0, src_level)); -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [PATCH] tgsi: fix tgsi_util_get_inst_usage_mask
From: Roland Scheidegger The logic for handling shadow coords was completely broken. Fixes be3ab867bd444594f9d9e0f8e59d305d15769afd. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103265 --- src/gallium/auxiliary/tgsi/tgsi_util.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c index be8bcdf..cfce590 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.c +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c @@ -292,17 +292,17 @@ tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst, case TGSI_OPCODE_TXL2: case TGSI_OPCODE_LODQ: case TGSI_OPCODE_TG4: { - unsigned dim_layer_shadow = + unsigned dim_layer = tgsi_util_get_texture_coord_dim(inst->Texture.Texture); - unsigned dim_layer, dim; + unsigned dim_layer_shadow, dim; - /* Remove shadow. */ + /* Add shadow. */ if (tgsi_is_shadow_target(inst->Texture.Texture)) { - dim_layer = dim_layer_shadow - 1; + dim_layer_shadow = dim_layer + 1; if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D) -dim_layer = 1; +dim_layer_shadow = 3; } else { - dim_layer = dim_layer_shadow; + dim_layer_shadow = dim_layer; } /* Remove layer. */ -- 2.7.4 ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev