Re: [Mesa-dev] [r600/sfn] Compilation error (and some warnings) - maybe to old LLVM git version, here?
Am Samstag, den 28.11.2020, 02:46 +0100 schrieb Dieter Nützel: > [48/179] Compiling C++ object > src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o > FAILED: > src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o > ccache c++ -Isrc/gallium/drivers/r600/libr600.a.p > -Isrc/gallium/drivers/r600 -I../src/gallium/drivers/r600 -Isrc > -I../src > -Isrc/mapi -I../src/mapi -Isrc/mesa -I../src/mesa -Iinclude > -I../include > -Isrc/compiler -I../src/compiler -I../src/gallium/include > -Isrc/gallium/auxiliary -I../src/gallium/auxiliary -Isrc/amd/common > -I../src/amd/common -Isrc/gallium/drivers -I../src/gallium/drivers > -Isrc/compiler/nir -I../src/compiler/nir -Isrc/util -I../src/util > -I/usr/include/libdrm -fvisibility=hidden -fdiagnostics-color=always > -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch > -Wnon-virtual-dtor -std=c++14 -O3 -ffunction-sections -fdata- > sections > '-DPACKAGE_VERSION="21.0.0-devel"' > '-DPACKAGE_BUGREPORT=" > https://gitlab.freedesktop.org/mesa/mesa/-/issues;' > -DUSE_ELF_TLS -DHAVE_ST_VDPAU -DENABLE_ST_OMX_BELLAGIO=0 > -DENABLE_ST_OMX_TIZONIA=0 -DHAVE_X11_PLATFORM -DHAVE_XCB_PLATFORM > -DGLX_INDIRECT_RENDERING -DGLX_DIRECT_RENDERING -DGLX_USE_DRM > -DHAVE_DRM_PLATFORM -DENABLE_SHADER_CACHE -DHAVE___BUILTIN_BSWAP32 > -DHAVE___BUILTIN_BSWAP64 -DHAVE___BUILTIN_CLZ -DHAVE___BUILTIN_CLZLL > -DHAVE___BUILTIN_CTZ -DHAVE___BUILTIN_EXPECT -DHAVE___BUILTIN_FFS > -DHAVE___BUILTIN_FFSLL -DHAVE___BUILTIN_POPCOUNT > -DHAVE___BUILTIN_POPCOUNTLL -DHAVE___BUILTIN_UNREACHABLE > -DHAVE_FUNC_ATTRIBUTE_CONST -DHAVE_FUNC_ATTRIBUTE_FLATTEN > -DHAVE_FUNC_ATTRIBUTE_MALLOC -DHAVE_FUNC_ATTRIBUTE_PURE > -DHAVE_FUNC_ATTRIBUTE_UNUSED > -DHAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT > -DHAVE_FUNC_ATTRIBUTE_WEAK -DHAVE_FUNC_ATTRIBUTE_FORMAT > -DHAVE_FUNC_ATTRIBUTE_PACKED -DHAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL > -DHAVE_FUNC_ATTRIBUTE_ALIAS -DHAVE_FUNC_ATTRIBUTE_NORETURN > -DHAVE_FUNC_ATTRIBUTE_VISIBILITY -DHAVE_UINT128 -DUSE_SSE41 > 
-DUSE_GCC_ATOMIC_BUILTINS -DUSE_X86_64_ASM -DMAJOR_IN_SYSMACROS > -DHAVE_LINUX_FUTEX_H -DHAVE_ENDIAN_H -DHAVE_DLFCN_H > -DHAVE_EXECINFO_H > -DHAVE_SYS_SHM_H -DHAVE_CET_H -DHAVE_STRTOF -DHAVE_MKOSTEMP > -DHAVE_TIMESPEC_GET -DHAVE_MEMFD_CREATE -DHAVE_RANDOM_R -DHAVE_FLOCK > -DHAVE_STRTOK_R -DHAVE_GETRANDOM -DHAVE_PROGRAM_INVOCATION_NAME > -DHAVE_POSIX_MEMALIGN -DHAVE_DIRENT_D_TYPE -DHAVE_STRTOD_L > -DHAVE_DLADDR > -DHAVE_DL_ITERATE_PHDR -DHAVE_ZLIB -DHAVE_ZSTD -DHAVE_PTHREAD > -DHAVE_PTHREAD_SETAFFINITY -DHAVE_LIBDRM -DLLVM_AVAILABLE > '-DMESA_LLVM_VERSION_STRING="12.0.0"' -DLLVM_IS_SHARED=1 > -DUSE_LIBGLVND=1 -DHAVE_LIBUNWIND -DHAVE_DRI3 -DHAVE_DRI3_MODIFIERS > -DHAVE_LIBSENSORS=1 -Werror=return-type -Werror=empty-body > -Wno-non-virtual-dtor -Wno-missing-field-initializers > -Wno-format-truncation -fno-math-errno -fno-trapping-math > -flifetime-dse=1 -Werror=format -Wformat-security -fPIC -pthread > -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -D__STDC_LIMIT_MACROS > -D__STDC_CONSTANT_MACROS -MD -MQ > src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o > -MF > src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o.d > -o > src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o > -c > ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp > ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp: In function > ‘unsigned int r600::barycentric_ij_index(nir_intrinsic_instr*)’: > ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp:102:4: > error: > control reaches end of non-void function [-Werror=return-type] >102 |case INTERP_MODE_FLAT: >|^~~~ There is an "assert" there that should be an "unreachable", patch coming up. Thanks for testing. Gert ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] [r600/sfn] Compilation error (and some warnings) - maybe to old LLVM git version, here?
[48/179] Compiling C++ object src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o FAILED: src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o ccache c++ -Isrc/gallium/drivers/r600/libr600.a.p -Isrc/gallium/drivers/r600 -I../src/gallium/drivers/r600 -Isrc -I../src -Isrc/mapi -I../src/mapi -Isrc/mesa -I../src/mesa -Iinclude -I../include -Isrc/compiler -I../src/compiler -I../src/gallium/include -Isrc/gallium/auxiliary -I../src/gallium/auxiliary -Isrc/amd/common -I../src/amd/common -Isrc/gallium/drivers -I../src/gallium/drivers -Isrc/compiler/nir -I../src/compiler/nir -Isrc/util -I../src/util -I/usr/include/libdrm -fvisibility=hidden -fdiagnostics-color=always -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wnon-virtual-dtor -std=c++14 -O3 -ffunction-sections -fdata-sections '-DPACKAGE_VERSION="21.0.0-devel"' '-DPACKAGE_BUGREPORT="https://gitlab.freedesktop.org/mesa/mesa/-/issues;' -DUSE_ELF_TLS -DHAVE_ST_VDPAU -DENABLE_ST_OMX_BELLAGIO=0 -DENABLE_ST_OMX_TIZONIA=0 -DHAVE_X11_PLATFORM -DHAVE_XCB_PLATFORM -DGLX_INDIRECT_RENDERING -DGLX_DIRECT_RENDERING -DGLX_USE_DRM -DHAVE_DRM_PLATFORM -DENABLE_SHADER_CACHE -DHAVE___BUILTIN_BSWAP32 -DHAVE___BUILTIN_BSWAP64 -DHAVE___BUILTIN_CLZ -DHAVE___BUILTIN_CLZLL -DHAVE___BUILTIN_CTZ -DHAVE___BUILTIN_EXPECT -DHAVE___BUILTIN_FFS -DHAVE___BUILTIN_FFSLL -DHAVE___BUILTIN_POPCOUNT -DHAVE___BUILTIN_POPCOUNTLL -DHAVE___BUILTIN_UNREACHABLE -DHAVE_FUNC_ATTRIBUTE_CONST -DHAVE_FUNC_ATTRIBUTE_FLATTEN -DHAVE_FUNC_ATTRIBUTE_MALLOC -DHAVE_FUNC_ATTRIBUTE_PURE -DHAVE_FUNC_ATTRIBUTE_UNUSED -DHAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT -DHAVE_FUNC_ATTRIBUTE_WEAK -DHAVE_FUNC_ATTRIBUTE_FORMAT -DHAVE_FUNC_ATTRIBUTE_PACKED -DHAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL -DHAVE_FUNC_ATTRIBUTE_ALIAS -DHAVE_FUNC_ATTRIBUTE_NORETURN -DHAVE_FUNC_ATTRIBUTE_VISIBILITY -DHAVE_UINT128 -DUSE_SSE41 -DUSE_GCC_ATOMIC_BUILTINS -DUSE_X86_64_ASM -DMAJOR_IN_SYSMACROS -DHAVE_LINUX_FUTEX_H -DHAVE_ENDIAN_H -DHAVE_DLFCN_H -DHAVE_EXECINFO_H 
-DHAVE_SYS_SHM_H -DHAVE_CET_H -DHAVE_STRTOF -DHAVE_MKOSTEMP -DHAVE_TIMESPEC_GET -DHAVE_MEMFD_CREATE -DHAVE_RANDOM_R -DHAVE_FLOCK -DHAVE_STRTOK_R -DHAVE_GETRANDOM -DHAVE_PROGRAM_INVOCATION_NAME -DHAVE_POSIX_MEMALIGN -DHAVE_DIRENT_D_TYPE -DHAVE_STRTOD_L -DHAVE_DLADDR -DHAVE_DL_ITERATE_PHDR -DHAVE_ZLIB -DHAVE_ZSTD -DHAVE_PTHREAD -DHAVE_PTHREAD_SETAFFINITY -DHAVE_LIBDRM -DLLVM_AVAILABLE '-DMESA_LLVM_VERSION_STRING="12.0.0"' -DLLVM_IS_SHARED=1 -DUSE_LIBGLVND=1 -DHAVE_LIBUNWIND -DHAVE_DRI3 -DHAVE_DRI3_MODIFIERS -DHAVE_LIBSENSORS=1 -Werror=return-type -Werror=empty-body -Wno-non-virtual-dtor -Wno-missing-field-initializers -Wno-format-truncation -fno-math-errno -fno-trapping-math -flifetime-dse=1 -Werror=format -Wformat-security -fPIC -pthread -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -MD -MQ src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o -MF src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o.d -o src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o -c ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp: In function ‘unsigned int r600::barycentric_ij_index(nir_intrinsic_instr*)’: ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp:102:4: error: control reaches end of non-void function [-Werror=return-type] 102 |case INTERP_MODE_FLAT: |^~~~ cc1plus: some warnings being treated as errors [56/179] Compiling C++ object src/gallium/frontends/clover/libclllvm.a.p/llvm_codegen_common.cpp.o In file included from ../src/gallium/frontends/clover/llvm/codegen/common.cpp:34: ../src/gallium/frontends/clover/llvm/metadata.hpp: In function ‘std::string clover::llvm::get_type_kernel_metadata(const llvm::Function&, const string&)’: ../src/gallium/frontends/clover/llvm/metadata.hpp:132:86: warning: ‘unsigned int llvm::VectorType::getNumElements() const’ is deprecated [-Wdeprecated-declarations] 132 | data += 
std::to_string(((::llvm::VectorType*)type)->getNumElements()); | ^ In file included from /usr/local/include/llvm/IR/DataLayout.h:26, from /usr/local/include/llvm/IR/Module.h:25, from ../src/gallium/frontends/clover/llvm/codegen.hpp:35, from ../src/gallium/frontends/clover/llvm/codegen/common.cpp:33: /usr/local/include/llvm/IR/DerivedTypes.h:534:10: note: declared here 534 | unsigned VectorType::getNumElements() const { | ^~ [57/179] Compiling C++ object src/gallium/frontends/clover/libclllvm.a.p/llvm_invocation.cpp.o In file included from ../src/gallium/frontends/clover/llvm/invocation.cpp:55: ../src/gallium/frontends/clover/llvm/metadata.hpp: In function ‘std::string clover::llvm::get_type_kernel_metadata(const llvm::Function&, const string&)’:
Re: [Mesa-dev] r600
Am Donnerstag, den 28.11.2019, 13:22 +1000 schrieb Dave Airlie: > On Wed, 27 Nov 2019 at 21:08, Gert Wollny > wrote: > > > > Before that I'd like to un-tabbify the whole r600 driver code, > > because all the other parts of mesa I've been touching use spaces, > > and it makes it more convenient to have the same WS handling > > everywhere. > > > I'm not against it from a style point of view, but from a it totally > breaks git history, blame, cherry-picking and many other useful > things I'd really ask you to reconsider and just use editorconfig > Fair enough, I'll keep the tabs for now and see whether I can get editorconfig to work for me. Best, Gert ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600
On Wed, 27 Nov 2019 at 21:08, Gert Wollny wrote: > > Hello Dave, > > I was wondering how much interest you still have in R600? I'm preparing > to start feeding my NIR work as MRs to continue my work in-tree. It is > currently only for Evergreen and still far from feature parity > with TGSI (no tesselation, no images, nor SSBOs), some things regress, > but some things are also fixed, so obviously the backend will only be > enabled on explicit request. > > Before that I'd like to un-tabbify the whole r600 driver code, because > all the other parts of mesa I've been touching use spaces, and it makes > it more convenient to have the same WS handling everywhere. > > Whould this be okay with you? I'm not against it from a style point of view, but from a it totally breaks git history, blame, cherry-picking and many other useful things I'd really ask you to reconsider and just use editorconfig. Maybe r600 is quiet enough now we don't have to worry about that much backports or cherry-pick, so I'll leave it up to you. At least leave sb alone since we probably want to kill that someday if your NIR backend gets there, and I doubt you want to touch it too much either. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600
Hello Dave, I was wondering how much interest you still have in R600? I'm preparing to start feeding my NIR work as MRs to continue my work in-tree. It is currently only for Evergreen and still far from feature parity with TGSI (no tessellation, no images, and no SSBOs), some things regress, but some things are also fixed, so obviously the backend will only be enabled on explicit request. Before that I'd like to un-tabbify the whole r600 driver code, because all the other parts of mesa I've been touching use spaces, and it makes it more convenient to have the same WS handling everywhere. Would this be okay with you? Best, Gert ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600/eg : ARB_query_buffer_object initial support.
This passes the arb_query_buffer-object-qbo test in piglit, the coherent test is a bit less successful but some of that is lacking support for indirect compute anyways. I'm not going to enable GL4.5, as we haven't got CTS coverage yet, but this is one of the last bits towards GL4.5 on cayman. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 sb tessellation support
Hi Dave, Am Mittwoch, den 10.01.2018, 16:48 +1000 schrieb Dave Airlie: > This is an attempt to add tessellation support to the SB backend. > I tried to dig a bit more into the failing piglits, specifically "1in-1out" that passed with your WIP branch from Jan/9. Now, with sb it fails by drawing some pseudo-random patterns that would indicate that for some of the patches uninitialized memory is accessed when drawing the upper left sub-quad of each patch. The pattern may change when pressing some key. However, by inspecting disassembly I was not able to detect any change in the LDS addresses accessed by the optimized shaders vs. the original ones, only the access order is (mostly) reversed (which is probably the most notable change w.r.t. the WIP branch). When I change the tess-factors to 2.0 (instead of 3.0) and also the related values in the vertex and tes shader, the test passes, which should confirm that the LDS access is correct. If I change the 3 only in tes to a positive value below 2.5, then the drawn pattern doesn't show random behaviour. Also if in the assignment to the input[].z value in the vertex shader the 64 is replaced by 64.0, making the evaluation there a floating point operation, then the random patterns in the piglit screen output disappear. Since this z-value is not used for the coordinate evaluation it should have no influence on these patterns. Best, Gert ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 sb tessellation support
Am Mittwoch, den 10.01.2018, 16:48 +1000 schrieb Dave Airlie: > This is an attempt to add tessellation support to the SB backend. > > The main things needed are GDS access which is used for tess > factor storage (also used for atomic counters), and LDS access > which is needed to pass all the data between stages. > > The first 19 patches are the stuff I'm happy with, the > nop/sanity shader tests pass with those (and sb enabled). > > The last two patches make heaven work and turn on sb, > I'm not suggested these be applied as-is yet. Compared to yesterdays 42 regressions nosb versus sb it's now down to five regressions within -t tessellation: execution 1in-1out tes-input/tes-input-gl_clipdistance tes-input/tes-input-patch-mat2x4_2 tes-input/tes-input-patch-mat3x4_2 tes-input/tes-input-patch-mat4 and one failure became a crash in trivial-tess-gs_no-gs-inputs but the crash can easily be alleviated with a patch I send out later. (I tried to send it yesterday, but noted today that something had gone wrong, i.e. I did send an empty email instead). Some numbers on 6870HD: Heaven 1280x1024 Quality: High, Tesselation: Normal, Anti-Aliasing: Off FPS: 18.5 [5.7, 70] (was: 12.6 [4.0, 61] before any optimization). Tessmark x32, 1024x640 FPS: 45, 2153 points (was 10, 635) With all this: Tested-By: Gert Wollny> I think in theory enabling sb for atomics/images/compute should > be fine after this series as well, but I haven't tested that too > much. I'll check this out. Many thanks for your work on this, Gert ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600 sb tessellation support
This is an attempt to add tessellation support to the SB backend. The main things needed are GDS access which is used for tess factor storage (also used for atomic counters), and LDS access which is needed to pass all the data between stages. The first 19 patches are the stuff I'm happy with, the nop/sanity shader tests pass with those (and sb enabled). The last two patches make heaven work and turn on sb; I'm not suggesting these be applied as-is yet. I think in theory enabling sb for atomics/images/compute should be fine after this series as well, but I haven't tested that too much. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600 ssbo/image fixes
I've been running deqp-gles31 over the r600 ssbo/image code it uses compute shaders, but I've found a few bugs in the in-tree code, so just sending some fixes out for those first. ssbo seems to pass all the tests, images have some heisenbug where they pass sometimes and not others. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600: cayman atomic gds support
There appears to be some bad interaction with the append/consume counters on cayman (and compute shaders at least). I traced fglrx and it appears it directly uses GDS memory. This adds cayman specific paths to directly use GDS memory for these atomics. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 evergreen+ shader image support
Am Mittwoch, den 15.11.2017, 10:11 +1000 schrieb Dave Airlie: > > It's not 100% on piglits, but it's quite close, and better than fglrx > does, so I'd probably prefer to land it before doing too much more > destructive hacking on it! I ran the piglits shader set on barts - no regressions, and all the newly tested piglits pass, i.e. basic-imagestore-const-uniform-index basic-imagestore-mixed-const-non-const-uniform-index basic-imagestore-mixed-const-non-const-uniform-index2 basic-imagestore-non-const-uniform-index arb_shader_image_load_store basic-imagestore-from-uniform disable_early_z image_checkerboard load-from-cleared-image write-to-rendered-image arb_shading_language_420pack different-bindings-image2d For the series: Tested-By: Gert WollnyBest, Gert ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600 evergreen+ shader image support
I've been hacking on this on/off for quite a while now, and I think I'm finally happy with where it has reached. It's not 100% on piglits, but it's quite close, and better than fglrx does, so I'd probably prefer to land it before doing too much more destructive hacking on it! If you have a cayman, you now get GL4.2. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600: some misc assembler and state updates
For the series: Reviewed-by: Nicolai HähnleOn 01.11.2017 00:32, Dave Airlie wrote: These are just some misc patches from the road to GL4.3 patches, They don't do anything on their own, just cleanly improve the assembler some state setting. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev -- Lerne, wie die Welt wirklich ist, Aber vergiss niemals, wie sie sein sollte. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600: some misc assembler and state updates
These are just some misc patches from the road-to-GL4.3 patches. They don't do anything on their own; they just cleanly improve the assembler and some state setting. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/AMDGPU fixes for Clover
On Fri, 2017-06-16 at 12:48 +0100, Emil Velikov wrote: > On 15 June 2017 at 14:03, Aaron Watrywrote: > > Hey all, > > > > We haven't landed the fixes to break the r600g dependency on AMDGPU yet. > > I'm headed out of town for a long weekend and don't feel like risking the > > push before being gone for five days. > > > > I've got a v3 of Emil's patch 4/5 that removes the AMDGPU header dependency > > from r600 and I'm good with the status of Jan's 3-patch series. I'm hoping > > we can square that away early next week unless is gets resolved while I'm > > gone. > > > > I've double-checked and Jan's 1-3 (squashed 2+3) alongside my 4-5 > resolve all the issues I could notice. > Pushed the lot and I'll parse through patchwork in a moment. thanks for keeping tabs on this. I've been mostly sick/travelling past 2 weeks. I'll be fully online starting Wednesday. The third patch of my series was separate, because it needed a bit more baking (I posted it to shift the discussion away from deletion), but I guess the few follow up bugs hit all the rough edges. libelf is only needed for with-opencl configurations (so android should be OK without this dep. see my response to Mauro). I'll post a patch that fixes automake this week. d5199c (Revert "amd/common: add missing libdrm include path") looks incorrect. libamd_common still includes amdgpu.h so I'd say it needs the CFLAGS. Moving the include to ac_gpu_info.c (in 81945) is enough to remove libdrm_amdgpu dependency from r600g builds. thanks, Jan > > Thanks > Emil -- Jan Vesely signature.asc Description: This is a digitally signed message part ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/AMDGPU fixes for Clover
On 15 June 2017 at 14:03, Aaron Watry wrote: > Hey all, > > We haven't landed the fixes to break the r600g dependency on AMDGPU yet. > I'm headed out of town for a long weekend and don't feel like risking the > push before being gone for five days. > > I've got a v3 of Emil's patch 4/5 that removes the AMDGPU header dependency > from r600 and I'm good with the status of Jan's 3-patch series. I'm hoping > we can square that away early next week unless is gets resolved while I'm > gone. > I've double-checked and Jan's 1-3 (squashed 2+3) alongside my 4-5 resolve all the issues I could notice. Pushed the lot and I'll parse through patchwork in a moment. Thanks Emil ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600/AMDGPU fixes for Clover
Hey all, We haven't landed the fixes to break the r600g dependency on AMDGPU yet. I'm headed out of town for a long weekend and don't feel like risking the push before being gone for five days. I've got a v3 of Emil's patch 4/5 that removes the AMDGPU header dependency from r600 and I'm good with the status of Jan's 3-patch series. I'm hoping we can square that away early next week unless it gets resolved while I'm gone. --Aaron ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600: some prelim fixes/patches for atomics
These are just some minor prelim patches for the GL4.3 work, that looked easy to split out. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600: Regarding "Failed to build shader (translation from TGSI)" #99349
Hello all, Hardware: Radeon 6850HD, Mesa: mesa 17.0.1 and git (sha 531887), llvm: 4.0.0. Playing around a bit with the Unreal Editor I was confronted with the same error message reported in #99349, i.e. "Failed to build shader (translation from TGSI)". After some digging through the code I found that the TGSI code [1] of the offending shader reserves 151 temporaries so that the available 128 GPRs are already allocated right from the start, and when the operation "MUL TEMP[11], CONST[26], CONST[23]" is translated to byte code, both constants are read from the cfile region, because tgsi_split_constant could not move one constant to a proper GPR. As one can see in the TGSI dump [1], the shader does not really use 151 temporaries: only 40 are actually also addressed as source; to all the other temps values are just written once (assuming the TGSI notation is OP DEST, SRC0, SRC1 ...). My questions are now: Does the GLSL-TGSI stage of the compilation do any optimizations? Specifically, should the unused temporaries be eliminated in that step, so that getting this TGSI dump is actually a bug in this compilation stage? (In the Gallium3D Wikipedia article [2] it is written that there is a TGSI optimization stage.) As far as I understand there is an optimization pass done after the TGSI translation, but because of the nature of the problem the shader is rejected before that. Would it make sense to implement a patch that would work around this problem by reserving some GPRs to move constants to (and the temporary that is now ctx.temp_reg), and then test the number of allocated registers only after the byte code optimization? I partially implemented something like this [3] when I tried to find the source of the bug, so I could clean that up and propose a patch; so far the graphical output is clobbered, though.
Many thanks, Gert [1] https://bugs.freedesktop.org/attachment.cgi?id=131567 (12kb, xz compressed) [2] https://en.wikipedia.org/wiki/Gallium3D#Tungsten_Graphics_Shader_Infrastructure [3] https://github.com/gerddie/mesa ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/compute: cleanup evergreen_compute.c
On Wed, Apr 06, 2016 at 10:40:50PM +0100, Dave Airlie wrote: > This probably should have been cleaned up before merging, but we > were a bit lax with it. This is a bunch of cleanups and changes, > that make adding ARB_compute_support less of a task. > Acked-by: Tom Stellard> Dave. > > ___ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/compute: cleanup evergreen_compute.c
Nice cleanup. This series is, Reviewed-by: Edward O'CallaghanOn 2016-04-07 07:40, Dave Airlie wrote: This probably should have been cleaned up before merging, but we were a bit lax with it. This is a bunch of cleanups and changes, that make adding ARB_compute_support less of a task. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600/compute: cleanup evergreen_compute.c
This probably should have been cleaned up before merging, but we were a bit lax with it. This is a bunch of cleanups and changes, that make adding ARB_compute_support less of a task. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 tess branches updated
Hi, On Fri, Dec 4, 2015 at 6:19 AM, Dave Airlie wrote: > Hey all, > > I've pushed an updated version of the r600g tess support to my > r600g-tess-submit branch. FWIW: Tested-by: Grazvydas Ignotas on JUNIPER XT with heaven and piglit, no issues noticed. Gražvydas ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600 tess branches updated
Hey all, I've pushed an updated version of the r600g tess support to my r600g-tess-submit branch. I'm in two minds whether we need to spam the list again, I think I've included all the review feedback so far, thanks to everyone that looked. The major changes since the last posting are: use 24-bit math operations for LDS index calculations; CAICOS/SUMO thread count changes - seems to make heaven run; dropping pointless delay slots in LDS reads; attempt to calculate SQ_LDS_ALLOC.HS_NUM_WAVES properly; don't re-emit the LDS constant buffers if we don't have to; fix sb GDS decoder as per Glenn's request; fix some minor bugs in the previous submit branch. I'll probably like to push this all next week unless anyone can find an objection! Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600 geometry shader workarounds
I've had these sitting locally and heiko on #dri-devel found they fixed some issues for him. Marek provided me with some errata and these are the results of implementing them. They are nearly all fixes for r600-era hw. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 multiple stream support + one misc debug patch
This patch series is: Reviewed-by: Edward O'Callaghan eocallag...@alterapraxis.com P.S. thanks for polishing it Dave! -- Edward O'Callaghan edward.ocallag...@koparo.com On Tue, Aug 25, 2015, at 11:18 AM, Dave Airlie wrote: This adds multiple stream support for ARB_gpu_shader5, and one other patch. It doesn't expose ARB_gpu_shader5 yet, as I think we'd like to try and get SB support for it into some sort of shape first. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600 multiple stream support + one misc debug patch
This adds multiple stream support for ARB_gpu_shader5, and one other patch. It doesn't expose ARB_gpu_shader5 yet, as I think we'd like to try and get SB support for it into some sort of shape first. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/16/2014 05:44 AM, Dave Airlie wrote: On 16 December 2014 at 08:59, Vadim Girlin vadimgir...@gmail.com wrote: On 12/16/2014 01:30 AM, Dave Airlie wrote: New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Based on my limited understanding of the code: Acked-by: Alex Deucher alexander.deuc...@amd.com Alex, thanks for the review, I understand you wanted it to get into mesa release, but it really needs careful testing with more apps, so far I hoped Dave would do it as long as he's looking into these issues anyway. In theory I can also install steam on the test machine and some games, it just needs the time and I'm not sure if I'll find it, so far my main job is sufficient to make me pretty tired. Current scheduler in SB is very fragile after adding handling for all special cases discovered during initial debugging etc, I said since the very beginning that I'd like to rewrite it, if only I had time. So any change like this can potentially break some apps even if piglit passes, and I'm not ready to take responsibility for that if I commit it myself, I just don't have time to deal with all possible consequences on all supported chips. If you think it's ok, just push this patch (it requires revert of the previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to help with it. Myself and Glenn are looking at it, Glenn noticed a piglit regression from this yesterday, I'll reproduce today and take a look. Hi, Dave Glenn, Thanks for looking into it. FWIW, when I worked on it I've ran piglit's quick tests and didn't see any regressions on evergreen (juniper 5750). 
There were some failed tests in some piglit runs, but AFAIU they were just random. Turns out we had a pre-existing fail that we noticed, not a regression. I'm going to push this, since its better than what is there, we can see if some public testing notices any big issues also. Thanks, Dave. I'm really sorry that I can't pay as much attention to that code as I'd like, and I really appreciate your and Glenn's efforts for maintaining it. (In case if someone thinks it's my fault, I must remind, I warned that I won't be able to support it even before it was merged. So please don't blame me :) ). ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600 atomics integration
Hi, I am trying to integrate atomic counters to Mesa. I was able to do until pipes (with the help of Marek and Ilia). How to integrate them to R600? Thank you! -- Regards, *Aditya Atluri,* *USA.* ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/12/2014 05:28 PM, Alex Deucher wrote: On Wed, Dec 10, 2014 at 6:50 AM, Vadim Girlin vadimgir...@gmail.com wrote: On 12/09/2014 07:39 AM, Vadim Girlin wrote: On 12/09/2014 05:18 AM, Dave Airlie wrote: On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. 
OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. This does seem to fix the problems in piglit, and looks close to what I was attempting but written by someone who knows what they are doing :-) What is the sb_sched.cpp change for at the end for? It fixes those scheduler/regalloc errors for switch tests. Unfortunately, now I've installed some benchmarks for testing and AFAICS this patch breaks at least lightsmark 2008, so it seems the condition removed by the patch was there for a reason. I'll probably try to come up with better fix. New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Based on my limited understanding of the code: Acked-by: Alex Deucher alexander.deuc...@amd.com Alex, thanks for the review, I understand you wanted it to get into mesa release, but it really needs careful testing with more apps, so far I hoped Dave would do it as long as he's looking into these issues anyway. In theory I can also install steam on the test machine and some games, it just needs the time and I'm not sure if I'll find it, so far my main job is sufficient to make me pretty tired. Current scheduler in SB is very fragile after adding handling for all special cases discovered during initial debugging etc, I said since the very beginning that I'd like to rewrite it, if only I had time. 
So any change like this can potentially break some apps even if piglit passes, and I'm not ready to take responsibility for that if I commit it myself, I just don't have time to deal with all possible consequences on all supported chips. If you think it's ok, just push this patch (it requires revert of the previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to help with it. Vadim Vadim Vadim Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Based on my limited understanding of the code: Acked-by: Alex Deucher alexander.deuc...@amd.com Alex, thanks for the review, I understand you wanted it to get into mesa release, but it really needs careful testing with more apps, so far I hoped Dave would do it as long as he's looking into these issues anyway. In theory I can also install steam on the test machine and some games, it just needs the time and I'm not sure if I'll find it, so far my main job is sufficient to make me pretty tired. Current scheduler in SB is very fragile after adding handling for all special cases discovered during initial debugging etc, I said since the very beginning that I'd like to rewrite it, if only I had time. So any change like this can potentially break some apps even if piglit passes, and I'm not ready to take responsibility for that if I commit it myself, I just don't have time to deal with all possible consequences on all supported chips. If you think it's ok, just push this patch (it requires revert of the previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to help with it. Myself and Glenn are looking at it, Glenn noticed a piglit regression from this yesterday, I'll reproduce today and take a look. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/16/2014 01:30 AM, Dave Airlie wrote: New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Based on my limited understanding of the code: Acked-by: Alex Deucher alexander.deuc...@amd.com Alex, thanks for the review, I understand you wanted it to get into mesa release, but it really needs careful testing with more apps, so far I hoped Dave would do it as long as he's looking into these issues anyway. In theory I can also install steam on the test machine and some games, it just needs the time and I'm not sure if I'll find it, so far my main job is sufficient to make me pretty tired. Current scheduler in SB is very fragile after adding handling for all special cases discovered during initial debugging etc, I said since the very beginning that I'd like to rewrite it, if only I had time. So any change like this can potentially break some apps even if piglit passes, and I'm not ready to take responsibility for that if I commit it myself, I just don't have time to deal with all possible consequences on all supported chips. If you think it's ok, just push this patch (it requires revert of the previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to help with it. Myself and Glenn are looking at it, Glenn noticed a piglit regression from this yesterday, I'll reproduce today and take a look. Hi, Dave Glenn, Thanks for looking into it. FWIW, when I worked on it I've ran piglit's quick tests and didn't see any regressions on evergreen (juniper 5750). There were some failed tests in some piglit runs, but AFAIU they were just random. If there are any problems with this fix, I'll be glad to try to help, if time allows. Vadim Dave. 
___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 16 December 2014 at 08:59, Vadim Girlin vadimgir...@gmail.com wrote: On 12/16/2014 01:30 AM, Dave Airlie wrote: New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Based on my limited understanding of the code: Acked-by: Alex Deucher alexander.deuc...@amd.com Alex, thanks for the review, I understand you wanted it to get into mesa release, but it really needs careful testing with more apps, so far I hoped Dave would do it as long as he's looking into these issues anyway. In theory I can also install steam on the test machine and some games, it just needs the time and I'm not sure if I'll find it, so far my main job is sufficient to make me pretty tired. Current scheduler in SB is very fragile after adding handling for all special cases discovered during initial debugging etc, I said since the very beginning that I'd like to rewrite it, if only I had time. So any change like this can potentially break some apps even if piglit passes, and I'm not ready to take responsibility for that if I commit it myself, I just don't have time to deal with all possible consequences on all supported chips. If you think it's ok, just push this patch (it requires revert of the previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to help with it. Myself and Glenn are looking at it, Glenn noticed a piglit regression from this yesterday, I'll reproduce today and take a look. Hi, Dave Glenn, Thanks for looking into it. FWIW, when I worked on it I've ran piglit's quick tests and didn't see any regressions on evergreen (juniper 5750). There were some failed tests in some piglit runs, but AFAIU they were just random. 
Turns out we had a pre-existing fail that we noticed, not a regression. I'm going to push this, since it's better than what is there, we can see if some public testing notices any big issues also. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On Wed, Dec 10, 2014 at 6:50 AM, Vadim Girlin vadimgir...@gmail.com wrote: On 12/09/2014 07:39 AM, Vadim Girlin wrote: On 12/09/2014 05:18 AM, Dave Airlie wrote: On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). 
There are no piglit regressions on evergreen, but I didn't test any real apps. This does seem to fix the problems in piglit, and looks close to what I was attempting but written by someone who knows what they are doing :-) What is the sb_sched.cpp change for at the end for? It fixes those scheduler/regalloc errors for switch tests. Unfortunately, now I've installed some benchmarks for testing and AFAICS this patch breaks at least lightsmark 2008, so it seems the condition removed by the patch was there for a reason. I'll probably try to come up with better fix. New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Based on my limited understanding of the code: Acked-by: Alex Deucher alexander.deuc...@amd.com Vadim Vadim Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/09/2014 07:39 AM, Vadim Girlin wrote: On 12/09/2014 05:18 AM, Dave Airlie wrote: On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. 
This does seem to fix the problems in piglit, and looks close to what I was attempting but written by someone who knows what they are doing :-) What is the sb_sched.cpp change for at the end for? It fixes those scheduler/regalloc errors for switch tests. Unfortunately, now I've installed some benchmarks for testing and AFAICS this patch breaks at least lightsmark 2008, so it seems the condition removed by the patch was there for a reason. I'll probably try to come up with better fix. New patch is attached, the only difference is in the sb_sched.cpp (it disables copy coalescing for some unsafe cases, so it may leave more MOVs than previously, but I don't think there will be any noticeable effect on performance). So far I don't see any problems with it, but I don't have many GL apps on the test machine. At least lightsmark and unigine demos work for me. Vadim Vadim Dave. From d2d16fa39c7b4e871d67e05bad92a540d7e5ea68 Mon Sep 17 00:00:00 2001 From: Vadim Girlin vadimgir...@gmail.com Date: Wed, 10 Dec 2014 14:41:10 +0300 Subject: [PATCH] r600g/sb: fix issues with loops created for switch --- src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 2 ++ src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 2 ++ src/gallium/drivers/r600/sb/sb_if_conversion.cpp | 4 ++-- src/gallium/drivers/r600/sb/sb_ir.h | 9 +++-- src/gallium/drivers/r600/sb/sb_sched.cpp | 3 +++ 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index f0849ca..3f362c4 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -110,6 +110,8 @@ int bc_finalizer::run() { void bc_finalizer::finalize_loop(region_node* r) { + update_nstack(r); + cf_node *loop_start = sh.create_cf(CF_OP_LOOP_START_DX10); cf_node *loop_end = sh.create_cf(CF_OP_LOOP_END); diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 
d787e5b..403f938 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -758,6 +758,8 @@ int bc_parser::prepare_loop(cf_node* c) { c->insert_before(reg); rep->move(c, end->next); + reg->src_loop = true; + loop_stack.push(reg); return 0; } diff --git a/src/gallium/drivers/r600/sb/sb_if_conversion.cpp b/src/gallium/drivers/r600/sb/sb_if_conversion.cpp index 93edace..3f2b1b1 100644 --- a/src/gallium/drivers/r600/sb/sb_if_conversion.cpp +++ b/src/gallium/drivers/r600/sb/sb_if_conversion.cpp @@ -115,13 +115,13 @@ void if_conversion::convert_kill_instructions(region_node *r, bool if_conversion::check_and_convert(region_node *r) { depart_node *nd1 = static_cast<depart_node*>(r->first); - if (!nd1->is_depart()) + if (!nd1->is_depart() || nd1->target != r) return false; if_node *nif = static_cast<if_node*>(nd1->first); if (!nif->is_if())
Re: [Mesa-dev] r600/sb loop issue
On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. Vadim sb has the -is_loop() and it just checks !repeats.empty(), so this meant in the finalizer code we'd fall into the if statement which would then assert. 
I hacked/fixed (more hacked), this in 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe which attempts to detect single pass loops and handle things that way. However this led to stack depth calculations being incorrectly done, so I moved the single loop detect into the is_loop check, (see attached patch). This fixes the rendering in some places, but led to a regression in tests/shaders/glsl-vs-continue-in-switch-in-do-while.shader_test error at : PHI t76||FP@R3.x, t128||FP@R3.x, t115||FP@R3.x, t102||FP@R3.x, t89||FP@R3.x : expected operand value t115||FP@R3.x, gpr contains t17||FP@R3.x error at : PHI t76||FP@R3.x, t128||FP@R3.x, t115||FP@R3.x, t102||FP@R3.x, t89||FP@R3.x : expected operand value t102||FP@R3.x, gpr contains t17||FP@R3.x Now Glenn suspected this was due to the is_loop check in sb_shader.cpp:create_bbs, and changing that check to only detect repeating loops removes that issue, but introduces stack sizing issues again, resulting in lockups/random rendering. So I just want to ask had you considered single loops with an always break in sb design, I didn't see such loops with any test cases, so I didn't even think about it. and perhaps some idea where things are going so wrong with the register alloc above. Not sure, but as long as the only repeat node is optimized away in bc_parser because it's useless due to unconditional break, I suspect it may be not easy to make all other code think that it's still a loop. I've tried a quick fix to not optimize the repeat away for such loops, but it results in other issues, probably it will require handling this as a special case in other places, so it doesn't look like a good idea either. I'll try to implement the solution that I described above, that is, translate resulting control flow back to ISA. If it won't be too much work, it's probably the best way and it won't use loop instructions in the end. I suspect I'll keep digging into this, but it's getting to the edges of the brain space/time I can find! Dave. 
From 4967ef90847f921fc0ef7c018ae7ae8048d2a6ce Mon Sep 17 00:00:00 2001 From: Vadim Girlin vadimgir...@gmail.com Date: Mon, 8 Dec 2014 13:11:48 +0300 Subject: [PATCH] r600g/sb: fix issues with loops created for switch statements --- src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 2 ++ src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 2 ++ src/gallium/drivers/r600/sb/sb_if_conversion.cpp | 4 ++-- src/gallium/drivers/r600/sb/sb_ir.h | 9 +++-- src/gallium/drivers/r600/sb/sb_sched.cpp | 2 +- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index f0849ca..3f362c4 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -110,6 +110,8 @@ int bc_finalizer::run() { void bc_finalizer::finalize_loop(region_node*
Re: [Mesa-dev] r600/sb loop issue
On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. 
This fixes one thing, but the switches are still broken here on cayman at least tests/spec/glsl-1.30/execution/switch/fs-default_last.shader_test -- FRAG PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1 DCL OUT[0], COLOR DCL CONST[0] DCL TEMP[0..2], LOCAL IMM[0] FLT32 {0., 1., 0., 0.} IMM[1] UINT32 {0, 4294967295, 0, 0} IMM[2] INT32 {1, 0, 0, 0} 0: MOV TEMP[0], IMM[0]. 1: MOV TEMP[1].x, IMM[1]. 2: BGNLOOP :0 3: UCMP TEMP[1].x, CONST[0]., TEMP[1]., IMM[1]. 4: UIF TEMP[1]. :0 5: MOV TEMP[0].x, IMM[0]. 6: BRK 7: ENDIF 8: USEQ TEMP[2].x, IMM[2]., CONST[0]. 9: UCMP TEMP[1].x, TEMP[2]., IMM[1]., TEMP[1]. 10: UIF TEMP[1]. :0 11: MOV TEMP[0].y, IMM[0]. 12: BRK 13: ENDIF 14: MOV TEMP[1].x, IMM[1]. 15: MOV TEMP[0].z, IMM[0]. 16: BRK 17: ENDLOOP :0 18: MOV OUT[0], TEMP[0] 19: END = SHADER #13 PS/CAYMAN/CAYMAN = = 72 dw = 6 gprs = 2 stack = 0012 a010 ALU 5 @36 0036 00f8 00200c90 1 x: MOVR1.x, 0 0038 00f8 20200c90y: MOVR1.y, 0 0040 00f8 40200c90z: MOVR1.z, 0 0042 80f8 60200c90w: MOVR1.w, 0 0044 80f8 00400c90 2 x: MOVR2.x, 0 0002 000f 8180 LOOP_START_DX10 @30 0004 4017 a404 ALU_PUSH_BEFORE 2 @46 KC0[CB0:0-15] 0046 809f6080 0043c002 3 x: CNDGE_INT R2.x, KC0[0].x, -1, R2.x 0048 801f00fe 00a0229c 4 MP x: PRED_SETNE_INT R5.x, PV.x, 0 0006 0007 8281 JUMP @14 POP:1 0008 0019 a000 ALU 1 @50 0050 84f9 00200c90 5 x: MOVR1.x, 1.0 0010 000e 8240 LOOP_BREAK @28 0012 0007 8381 POP @14 POP:1 0014 401a a408 ALU_PUSH_BEFORE 3 @52 KC0[CB0:0-15] 0052 801000fa 00601d10 6 x: SETE_INT R3.x, 1, KC0[0].x 0054 800040fe 0043c4fb 7 x: CNDGE_INT R2.x, PV.x, R2.x, -1 0056 801f00fe 00a0229c 8 MP x: PRED_SETNE_INT R5.x, PV.x, 0 0016 000c 8281 JUMP @24 POP:1 0018 001d a000 ALU 1 @58 0058 84f9 20200c90 9 y: MOVR1.y, 1.0 0020 000e 8240 LOOP_BREAK @28 0022 000c 8381 POP @24 POP:1 0024 001e a004 ALU 2 @60 0060 04fb 00400c9010 x: MOVR2.x, -1 0062 84f9 40200c90z: MOVR1.z, 1.0 0026 000e 8240 LOOP_BREAK @28 0028 0002 8140 LOOP_END @4 0030 0020 a00c ALU 4 @64 0064 0001 0c9011 x: MOVR0.x, R1.x 0066 0401 2c90y: MOVR0.y, R1.y 0068 
0801 4c90z: MOVR0.z, R1.z 0070 8c01 6c90w: MOV
Re: [Mesa-dev] r600/sb loop issue
On 9 December 2014 at 10:25, Dave Airlie airl...@gmail.com wrote: On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. 
This fixes one thing, but the switches are still broken here on cayman at least. Actually, ignore that; another regression snuck into r600g that I had to fix. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. 
This does seem to fix the problems in piglit, and looks close to what I was attempting but written by someone who knows what they are doing :-) What is the sb_sched.cpp change at the end for? Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/09/2014 05:18 AM, Dave Airlie wrote: On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:13 AM, Vadim Girlin wrote: On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. In fact handling that control flow in finalizer is not as easy as I hoped, probably impossible, at least if we want to make it efficient. I forgot about the limitations of R600 ISA. OTOH it seems I've managed to fix the issues with loops, the patch is attached (it's meant to be used instead of 7b0067d2). There are no piglit regressions on evergreen, but I didn't test any real apps. 
This does seem to fix the problems in piglit, and looks close to what I was attempting but written by someone who knows what they are doing :-) What is the sb_sched.cpp change at the end for? It fixes those scheduler/regalloc errors for switch tests. Unfortunately, now I've installed some benchmarks for testing and AFAICS this patch breaks at least lightsmark 2008, so it seems the condition removed by the patch was there for a reason. I'll probably try to come up with a better fix. Vadim Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/04/2014 01:43 AM, Dave Airlie wrote: Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. Hi, Dave, I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I think loops are less efficient than other control flow instructions on r600g hw (at least because they increase stack usage), and possibly on other hw too. In fact it seems sb basically gets rid of it already in IR, it just doesn't know how to translate resulting control flow to ISA, because so far it only supports specific control flow structure for if-then-else that was previously preserved during optimizations. I think it may be not very hard to implement support for that in finalizer, I'll look into it. sb has the -is_loop() and it just checks !repeats.empty(), so this meant in the finalizer code we'd fall into the if statement which would then assert. I hacked/fixed (more hacked), this in 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe which attempts to detect single pass loops and handle things that way. However this lead to stack depth calculations being incorrectly done, so I moved the single loop detect into the is_loop check, (see attached patch). 
This fixes the rendering in some places, but lead to a regression in tests/shaders/glsl-vs-continue-in-switch-in-do-while.shader_test error at : PHI t76||FP@R3.x, t128||FP@R3.x, t115||FP@R3.x, t102||FP@R3.x, t89||FP@R3.x : expected operand value t115||FP@R3.x, gpr contains t17||FP@R3.x error at : PHI t76||FP@R3.x, t128||FP@R3.x, t115||FP@R3.x, t102||FP@R3.x, t89||FP@R3.x : expected operand value t102||FP@R3.x, gpr contains t17||FP@R3.x Now Glenn suspected this was due to the is_loop check in sb_shader.cpp:create_bbs, and changing that check to only detect repeating loops removes that issue, but introduces stack sizing issues again, resulting in lockups/random rendering. So I just want to ask had you considered single loops with an always break in sb design, I didn't see such loops with any test cases, so I didn't even think about it. and perhaps some idea where things are going so wrong with the register alloc above. Not sure, but as long as the only repeat node is optimized away in bc_parser because it's useless due to unconditional break, I suspect it may be not easy to make all other code think that it's still a loop. I've tried a quick fix to not optimize the repeat away for such loops, but it results in other issues, probably it will require handling this as a special case in other places, so it doesn't look like a good idea either. I'll try to implement the solution that I described above, that is, translate resulting control flow back to ISA. If it won't be too much work, it's probably the best way and it won't use loop instructions in the end. I suspect I'll keep digging into this, but its getting to the edges of the brain space/time I can find! Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com wrote: I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I don't think that's true. I don't see anything in the spec that would lead me to believe continue cannot occur in a switch statement. In fact, we have some relatively complicated shaders that have a continue in a switch. See tests/shaders/glsl-fs-continue-in-switch-in-do-while.shader_test ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/06/2014 07:50 AM, Matt Turner wrote: On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com wrote: I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I don't think that's true. I don't see anything in the spec that would lead me to believe continue cannot occur in a switch statement. I've double-checked some versions of GLSL spec (1.30, 1.50, 3.30, 4.40) and all of them say the same (section 6.4 Jumps): The continue jump is used only in loops. In fact, we have some relatively complicated shaders that have a continue in a switch. See tests/shaders/glsl-fs-continue-in-switch-in-do-while.shader_test ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On Fri, Dec 5, 2014 at 8:56 PM, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:50 AM, Matt Turner wrote: On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com wrote: I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I don't think that's true. I don't see anything in the spec that would lead me to believe continue cannot occur in a switch statement. I've double-checked some versions of GLSL spec (1.30, 1.50, 3.30, 4.40) and all of them say the same (section 6.4 Jumps): The continue jump is used only in loops. Sure, but isn't the continue below in a loop? do { switch (...) { case ...: continue; } } while (...); The grammar is pretty unambiguous. jump_statement: CONTINUE SEMICOLON BREAK SEMICOLON RETURN SEMICOLON RETURN expression SEMICOLON DISCARD SEMICOLON // Fragment shader only. If continue can't be in a switch, neither can break. :) ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600/sb loop issue
On 12/06/2014 08:01 AM, Matt Turner wrote: On Fri, Dec 5, 2014 at 8:56 PM, Vadim Girlin vadimgir...@gmail.com wrote: On 12/06/2014 07:50 AM, Matt Turner wrote: On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com wrote: I suspect we should rather get rid of such loops somehow, i.e. convert to something else, the loop that never repeats is not really a loop anyway. AFAICS continue is not supported in switch statements according to GLSL specs, so the loops generated for switch will never be repeated. Am I missing something? Even if repeating is possible somehow, at least we can get rid of the loops that are not repeated. I don't think that's true. I don't see anything in the spec that would lead me to believe continue cannot occur in a switch statement. I've double-checked some versions of GLSL spec (1.30, 1.50, 3.30, 4.40) and all of them say the same (section 6.4 Jumps): The continue jump is used only in loops. Sure, but isn't the continue below in a loop? do { switch (...) { case ...: continue; } } while (...); Ah, now I see, you're right. I just was mostly thinking about that loop that is created for a switch in IR, not about source, and somehow confused these things. Thanks for pointing that out. Hopefully such cases won't complicate the problem in sb even more, need to check those tests. The grammar is pretty unambiguous. jump_statement: CONTINUE SEMICOLON BREAK SEMICOLON RETURN SEMICOLON RETURN expression SEMICOLON DISCARD SEMICOLON // Fragment shader only. If continue can't be in a switch, neither can break. :) ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600/sb loop issue
Hi Vadim, I've been looking with Glenn's help into a bug in sb for a couple of weeks now triggered by a change in how GLSL generates switch statements. I understand you probably aren't too interested in r600g but I believe I'm hitting a design level problem and I would like some advice. So it appears that GLSL can create loops that don't repeat for switch statements, and it appears SB wasn't ready to handle such a thing. sb has the ->is_loop() and it just checks !repeats.empty(), so this meant in the finalizer code we'd fall into the if statement which would then assert. I hacked/fixed (more hacked), this in 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe which attempts to detect single pass loops and handle things that way. However this led to stack depth calculations being incorrectly done, so I moved the single loop detect into the is_loop check, (see attached patch). This fixes the rendering in some places, but led to a regression in tests/shaders/glsl-vs-continue-in-switch-in-do-while.shader_test error at : PHI t76||FP@R3.x, t128||FP@R3.x, t115||FP@R3.x, t102||FP@R3.x, t89||FP@R3.x : expected operand value t115||FP@R3.x, gpr contains t17||FP@R3.x error at : PHI t76||FP@R3.x, t128||FP@R3.x, t115||FP@R3.x, t102||FP@R3.x, t89||FP@R3.x : expected operand value t102||FP@R3.x, gpr contains t17||FP@R3.x Now Glenn suspected this was due to the is_loop check in sb_shader.cpp:create_bbs, and changing that check to only detect repeating loops removes that issue, but introduces stack sizing issues again, resulting in lockups/random rendering. So I just want to ask had you considered single loops with an always break in sb design, and perhaps some idea where things are going so wrong with the register alloc above. I suspect I'll keep digging into this, but it's getting to the edges of the brain space/time I can find! Dave.
From 170184b712d9596f761acdee2c7cff2a2792d937 Mon Sep 17 00:00:00 2001 From: Dave Airlie airl...@redhat.com Date: Wed, 3 Dec 2014 13:05:18 +1000 Subject: [PATCH] r600g/sb: detect empty once iterated loops Since GLSL changed to using loops for switches, we've hit a bug in sb with single execution loops, I previously attempted to fix this by changing where we detected loops, however this isn't good enough as SB gets the stack sizing wrong. Fix this by checking inside the is_loop for single execution loops. This should fix lockups on rv635 and misrenderings on cayman since the first fix: 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe fix issues cause by GLSL switching to loops for switch Signed-off-by: Dave Airlie airl...@redhat.com --- src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 22 +++ src/gallium/drivers/r600/sb/sb_ir.cpp | 30 ++ src/gallium/drivers/r600/sb/sb_ir.h| 3 +-- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index 0fa0910..56189c9 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -46,19 +46,7 @@ int bc_finalizer::run() { for (regions_vec::reverse_iterator I = rv.rbegin(), E = rv.rend(); I != E; ++I) { region_node *r = *I; - bool is_if = false; - assert(r); - - assert(r-first); - if (r-first-is_container()) { - container_node *repdep1 = static_castcontainer_node*(r-first); - assert(repdep1-is_depart() || repdep1-is_repeat()); - if_node *n_if = static_castif_node*(repdep1-first); - if (n_if n_if-is_if()) -is_if = true; - } - - if (is_if) + if (!r-is_loop()) finalize_if(r); else finalize_loop(r); @@ -121,7 +109,13 @@ void bc_finalizer::finalize_loop(region_node* r) { cf_node *loop_end = sh.create_cf(CF_OP_LOOP_END); bool has_instr = false; - if (!r-is_loop()) { + /* + * if we have repeats then we have instructions, + * if we have no repeats for a single loops, + * check if there are 
any instructions in the depart + * nodes. + */ + if (r-repeats.empty()) { for (depart_vec::iterator I = r-departs.begin(), E = r-departs.end(); I != E; ++I) { depart_node *dep = *I; diff --git a/src/gallium/drivers/r600/sb/sb_ir.cpp b/src/gallium/drivers/r600/sb/sb_ir.cpp index 5226893..a4c4e83 100644 --- a/src/gallium/drivers/r600/sb/sb_ir.cpp +++ b/src/gallium/drivers/r600/sb/sb_ir.cpp @@ -485,6 +485,36 @@ void container_node::collect_stats(node_stats s) { } } +bool region_node::is_loop() { + + if (!repeats.empty()) + return true; + + /* + * single pass loops have no repeats, however we need to detect + * them as loops. + */ + + /* if we have no first in the region then it can't be a loop. */ + if (!first) + return false; + + /* + * if the first is a container, see if it has an if node, + * if nodes aren't loops, if there is no if node, + * then this is a single pass loops. + */ + if (first-is_container()) { + container_node
Re: [Mesa-dev] R600/OpenCL - kernel_param resource
On Thu, Apr 10, 2014 at 03:24:32PM +, Dorrington, Albert wrote: I am having an issue with a memory leak in an OpenCL program I am testing. In the program I call the same kernel repeatedly, for every pixel in an image. (Probably not the most efficient code, but it is a learning/testing thing.) One thing in particular I have not yet been able to figure out, is what releases the reference counts for the shader-kernel_param resource created in evergreen_compute_upload_input(). Tracing through the calls: evergreen_compute_upload_input() evergreen_cs_set_constant_buffer() r600_set_constant_buffer() I can see that if r600_set_constant_buffer() is passed a null pipe_constant_buffer input, that it would reset the stat masks and make the call to pipe_resource_reference() with a NULL, to decrement the count. But I don't see where that would happen. I am thinking that perhaps there should be something to release the reference count for that buffer, either after the evergreen_launch_grid() call, or perhaps as the last thing within that call, after the compute_emit_cs() call. Or, is this call happening somewhere else that I haven't found? kernel_param is probably the source of the leak, it doesn't look like we are destroying it anywhere. -Tom Thanks, Al Dorrington Software Engineer Sr Lockheed Martin, Mission Systems and Training ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600/OpenCL - kernel_param resource
I am having an issue with a memory leak in an OpenCL program I am testing. In the program I call the same kernel repeatedly, for every pixel in an image. (Probably not the most efficient code, but it is a learning/testing thing.) One thing in particular I have not yet been able to figure out, is what releases the reference counts for the shader->kernel_param resource created in evergreen_compute_upload_input(). Tracing through the calls: evergreen_compute_upload_input() evergreen_cs_set_constant_buffer() r600_set_constant_buffer() I can see that if r600_set_constant_buffer() is passed a null pipe_constant_buffer input, that it would reset the stat masks and make the call to pipe_resource_reference() with a NULL, to decrement the count. But I don't see where that would happen. I am thinking that perhaps there should be something to release the reference count for that buffer, either after the evergreen_launch_grid() call, or perhaps as the last thing within that call, after the compute_emit_cs() call. Or, is this call happening somewhere else that I haven't found? Thanks, Al Dorrington Software Engineer Sr Lockheed Martin, Mission Systems and Training ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600+700 geometry shader patch
I've lightly tested this, not piglit strength yet, and it does require the kernel patch to work. It's also available in a branch in my repo r600-geom-shaders. Dave. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI: New intrinsics for radeonsi geometry shaders
On Fri, Jan 24, 2014 at 03:17:04PM +0900, Michel Dänzer wrote: The attached patches add two intrinsics to the R600 backend which are necessary for geometry shader support in the radeonsi driver. Patch 1 and v2 of Patch 2 are: Reviewed-by: Tom Stellard thomas.stell...@amd.com -Tom -- Earthling Michel Dänzer| http://www.amd.com Libre software enthusiast |Mesa and X developer From 8feb7201ac894e5a6731a157020ac807936f584d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Fri, 29 Nov 2013 18:21:41 +0900 Subject: [PATCH 1/2] R600/SI: Add intrinsic for S_SENDMSG instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 31 +++ lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h | 1 + lib/Target/R600/SIInsertWaits.cpp | 6 + lib/Target/R600/SIInstructions.td | 16 ++-- lib/Target/R600/SIIntrinsics.td | 2 ++ test/CodeGen/R600/llvm.SI.sendmsg.ll | 21 +++ 6 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/R600/llvm.SI.sendmsg.ll diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 99e1377..7105879 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -316,6 +316,37 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + raw_ostream O) { + unsigned SImm16 = MI-getOperand(OpNo).getImm(); + unsigned Msg = SImm16 0xF; + if (Msg == 2 || Msg == 3) { +unsigned Op = (SImm16 4) 0xF; +if (Msg == 3) + O Gs_done(; +else + O Gs(; +if (Op == 0) { + O nop; +} else { + unsigned Stream = (SImm16 8) 0x3; + if (Op == 1) + O cut; + else if (Op == 2) + O emit; + else if (Op == 3) + O emit-cut; + O stream Stream; +} +O ), [m0] ; + } else if (Msg == 1) +O 
interrupt ; + else if (Msg == 15) +O system ; + else +O unknown( Msg ) ; +} + void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream O) { // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 77af942..2876dd2 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -53,6 +53,7 @@ private: void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream O); void printCT(const MCInst *MI, unsigned OpNo, raw_ostream O); void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream O); + void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream O); void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream O); }; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7ef662e..695ec40 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -314,6 +314,12 @@ Counters SIInsertWaits::handleOperands(MachineInstr MI) { Counters Result = ZeroCounts; + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (MI.getOpcode() == AMDGPU::S_SENDMSG) +return LastIssued; + // For each register affected by this // instruction increase the result sequence for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 3baa4cd..c0ad398 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -22,6 +22,10 @@ def InterpSlot : Operandi32 { let PrintMethod = printInterpSlot; } +def SendMsgImm : Operandi32 { + let PrintMethod = printSendMsg; +} + def isSI : PredicateSubtarget.getGeneration() = AMDGPUSubtarget::SOUTHERN_ISLANDS; @@ -826,17 +830,25 
@@ def S_BARRIER : SOPP 0x000a, (ins), S_BARRIER, def S_WAITCNT : SOPP 0x000c, (ins WAIT_FLAG:$simm16), S_WAITCNT $simm16, [] ; -} // End hasSideEffects //def S_SETHALT : SOPP_ 0x000d, S_SETHALT, []; //def S_SLEEP : SOPP_ 0x000e, S_SLEEP, []; //def S_SETPRIO : SOPP_ 0x000f, S_SETPRIO, []; -//def S_SENDMSG : SOPP_ 0x0010, S_SENDMSG, []; + +let Uses = [EXEC] in { +
[Mesa-dev] R600/SI: New intrinsics for radeonsi geometry shaders
The attached patches add two intrinsics to the R600 backend which are necessary for geometry shader support in the radeonsi driver. -- Earthling Michel Dänzer| http://www.amd.com Libre software enthusiast |Mesa and X developer From 8feb7201ac894e5a6731a157020ac807936f584d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Fri, 29 Nov 2013 18:21:41 +0900 Subject: [PATCH 1/2] R600/SI: Add intrinsic for S_SENDMSG instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 31 +++ lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h | 1 + lib/Target/R600/SIInsertWaits.cpp | 6 + lib/Target/R600/SIInstructions.td | 16 ++-- lib/Target/R600/SIIntrinsics.td | 2 ++ test/CodeGen/R600/llvm.SI.sendmsg.ll | 21 +++ 6 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/R600/llvm.SI.sendmsg.ll diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 99e1377..7105879 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -316,6 +316,37 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, + raw_ostream O) { + unsigned SImm16 = MI-getOperand(OpNo).getImm(); + unsigned Msg = SImm16 0xF; + if (Msg == 2 || Msg == 3) { +unsigned Op = (SImm16 4) 0xF; +if (Msg == 3) + O Gs_done(; +else + O Gs(; +if (Op == 0) { + O nop; +} else { + unsigned Stream = (SImm16 8) 0x3; + if (Op == 1) + O cut; + else if (Op == 2) + O emit; + else if (Op == 3) + O emit-cut; + O stream Stream; +} +O ), [m0] ; + } else if (Msg == 1) +O interrupt ; + else if (Msg == 15) +O system ; + else +O unknown( Msg ) ; +} + void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, 
raw_ostream O) { // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 77af942..2876dd2 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -53,6 +53,7 @@ private: void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream O); void printCT(const MCInst *MI, unsigned OpNo, raw_ostream O); void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream O); + void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream O); void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream O); }; diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7ef662e..695ec40 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -314,6 +314,12 @@ Counters SIInsertWaits::handleOperands(MachineInstr MI) { Counters Result = ZeroCounts; + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (MI.getOpcode() == AMDGPU::S_SENDMSG) +return LastIssued; + // For each register affected by this // instruction increase the result sequence for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 3baa4cd..c0ad398 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -22,6 +22,10 @@ def InterpSlot : Operandi32 { let PrintMethod = printInterpSlot; } +def SendMsgImm : Operandi32 { + let PrintMethod = printSendMsg; +} + def isSI : PredicateSubtarget.getGeneration() = AMDGPUSubtarget::SOUTHERN_ISLANDS; @@ -826,17 +830,25 @@ def S_BARRIER : SOPP 0x000a, (ins), S_BARRIER, def S_WAITCNT : SOPP 0x000c, (ins WAIT_FLAG:$simm16), S_WAITCNT $simm16, [] ; -} // End 
hasSideEffects //def S_SETHALT : SOPP_ 0x000d, S_SETHALT, []; //def S_SLEEP : SOPP_ 0x000e, S_SLEEP, []; //def S_SETPRIO : SOPP_ 0x000f, S_SETPRIO, []; -//def S_SENDMSG : SOPP_ 0x0010, S_SENDMSG, []; + +let Uses = [EXEC] in { + def S_SENDMSG : SOPP 0x0010, (ins SendMsgImm:$simm16, M0Reg:$m0), S_SENDMSG $simm16, + [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)] + { +let DisableEncoding = $m0; + } +} // End Uses = [EXEC] + //def S_SENDMSGHALT : SOPP_ 0x0011, S_SENDMSGHALT, []; //def S_TRAP : SOPP_ 0x0012,
[Mesa-dev] r600 Evergreen Compute and compute_memory_grow_pool
The compute memory pool used by the gallium r600 driver seems to be problematic. The pool looks to be a single radeon buffer object. There could be multiple maps set up into that single buffer object. If there is a need to grow the pool, then the resource associated with the buffer object is destroyed, which results in all of the maps for that buffer object also being destroyed. When the new larger pool is created, the pointers that the application has to the mapped region are no longer valid. A temporary work-around would appear to be to make sure that the buffer pool is large enough that there isn't a need to grow the pool once any maps into it are created. A longer term solution seems much harder. Even if the maps could all be precisely recreated into the newly allocated buffer object, there would be a period of time when the pointers held by the application would be invalid. John Hrustich LM Master Software Architect, Mission Systems and Training Lockheed Martin Corporation 1801 State Route 17C, Mail Drop 0220, Owego, NY 13827 O 607-751-4206 | E john.hrust...@lmco.com 100 Years of Accelerating Tomorrow ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 Evergreen Compute and compute_memory_grow_pool
On Mon, Jan 20, 2014 at 09:32:11PM +, Hrustich, John wrote: The compute memory pool used by the gallium r600 driver seems to be problematic. The pool looks to be a single radeon buffer object. There could be multiple maps set up into that single buffer object. If there is a need to grow the pool, then the resource associated with the buffer object is destroyed, which results in all of the maps for that buffer object also being destroyed. When the new larger pool is created, the pointers that the application has to the mapped region are no longer valid. A temporary work-around would appear to be to make sure that the buffer pool is large enough that there isn't a need to grow the pool once any maps into it are created. A longer term solution seems much harder. Even if the maps could all be precisely recreated into the newly allocated buffer object, there would be a period of time when the pointers held by the application would be invalid. This is just one of the many problems with the compute memory pool. It would be good to have some piglit tests for the use case you described. I think the compute code in r600g has stabilized enough now that we could consider replacing the memory pool with something else. I'm open to suggestions if you have any ideas. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI: Support for local memory and derivatives
On Fre, 2013-06-28 at 14:37 -0700, Tom Stellard wrote: On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote: These patches implement enough of local memory support to allow radeonsi to use that for computing derivatives, as suggested by Tom. They also almost allow test/CodeGen/R600/local-memory.ll to generate code for SI. Right now it still fails because it tries to copy a VGPR to an SGPR, which is not possible. Can you add some lit tests for these new intrinsics Done, updated patches attached. and also add CHECK lines for SI to the existing local-memory.ll test. Can't do that while it still fails to generate SI code. Should I commit the other patches anyway, which are only necessary for that test? -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer From 3572bab6a6b5c967d19add0b0497a96123754ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 16:12:45 +0100 Subject: [PATCH v2 1/4] R600/SI: Add intrinsics for texture sampling with user derivatives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- v2: Add lit test lib/Target/R600/SIInstructions.td| 7 +- lib/Target/R600/SIIntrinsics.td | 1 + test/CodeGen/R600/llvm.SI.sampled.ll | 140 +++ 3 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/R600/llvm.SI.sampled.ll diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..c9eac7d 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B; //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027; def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029; -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ 
IMAGE_SAMPLE_C_D, 0x002a; +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b; def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L; def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B; @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type { def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type; def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; + + def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; + def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; } defm : SamplePatternsv2i32; diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 224cd2f..d2643e0 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in { def int_SI_sample : Sample; def int_SI_sampleb : Sample; + def int_SI_sampled : Sample; def int_SI_samplel : Sample; def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]; diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll new file mode 100644 index 000..71b8ef5 --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.sampled.ll @@ -0,0 +1,140 @@ +;RUN: llc %s -march=r600 -mcpu=verde | FileCheck %s + +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5 
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8 + +define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { + %v1 = insertelement 4 x i32 undef, i32 %a1, i32 0 + %v2 = insertelement 4 x i32 undef, i32 %a1, i32 1 + %v3 = insertelement 4 x i32 undef, i32 %a1, i32 2 + %v4 = insertelement 4 x i32 undef, i32 %a1, i32 3
Re: [Mesa-dev] R600/SI: Support for local memory and derivatives
On Wed, Jul 10, 2013 at 12:32:25PM +0200, Michel Dänzer wrote: On Fre, 2013-06-28 at 14:37 -0700, Tom Stellard wrote: On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote: These patches implement enough of local memory support to allow radeonsi to use that for computing derivatives, as suggested by Tom. They also almost allow test/CodeGen/R600/local-memory.ll to generate code for SI. Right now it still fails because it tries to copy a VGPR to an SGPR, which is not possible. Can you add some lit tests for these new intrinsics Done, updated patches attached. and also add CHECK lines for SI to the existing local-memory.ll test. Can't do that while it still fails to generate SI code. Should I commit the other patches anyway, which are only necessary for that test? Can you add a TODO comment to that test for adding SI checks? With that change, the patches are: Reviewed-by: Tom Stellard thomas.stell...@amd.com -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer From 3572bab6a6b5c967d19add0b0497a96123754ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 16:12:45 +0100 Subject: [PATCH v2 1/4] R600/SI: Add intrinsics for texture sampling with user derivatives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- v2: Add lit test lib/Target/R600/SIInstructions.td| 7 +- lib/Target/R600/SIIntrinsics.td | 1 + test/CodeGen/R600/llvm.SI.sampled.ll | 140 +++ 3 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/R600/llvm.SI.sampled.ll diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..c9eac7d 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B; //def IMAGE_SAMPLE_LZ : 
MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027; def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029; -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a; +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b; def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L; def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B; @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type { def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type; def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; + + def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; + def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; } defm : SamplePatternsv2i32; diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 224cd2f..d2643e0 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in { def int_SI_sample : Sample; def int_SI_sampleb : Sample; + def int_SI_sampled : Sample; def int_SI_samplel : Sample; def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]; diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll new file mode 100644 index 000..71b8ef5 --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.sampled.ll @@ -0,0 +1,140 @@ +;RUN: llc %s -march=r600 -mcpu=verde | FileCheck %s + +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3 +;CHECK: 
IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13 +;CHECK: IMAGE_SAMPLE_D
Re: [Mesa-dev] R600/SI: Support for local memory and derivatives
On Mit, 2013-07-10 at 08:15 -0700, Tom Stellard wrote: On Wed, Jul 10, 2013 at 12:32:25PM +0200, Michel Dänzer wrote: On Fre, 2013-06-28 at 14:37 -0700, Tom Stellard wrote: On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote: These patches implement enough of local memory support to allow radeonsi to use that for computing derivatives, as suggested by Tom. They also almost allow test/CodeGen/R600/local-memory.ll to generate code for SI. Right now it still fails because it tries to copy a VGPR to an SGPR, which is not possible. Can you add some lit tests for these new intrinsics Done, updated patches attached. and also add CHECK lines for SI to the existing local-memory.ll test. Can't do that while it still fails to generate SI code. Should I commit the other patches anyway, which are only necessary for that test? Can you add a TODO comment to that test for adding SI checks? With that change, the patches are: Reviewed-by: Tom Stellard thomas.stell...@amd.com Thanks, I managed to enable basic lit testing after all, see the attached patches 4 and 5. -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer From 0f11058228a2c6504ed78f9856e6de3f8af0c0e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Wed, 19 Jun 2013 11:01:00 +0200 Subject: [PATCH 4/5] R600/SI: Add pattern for the AMDGPU.barrier.local intrinsic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lit test coverage to follow in the next commit. 
Reviewed-by: Tom Stellard thomas.stell...@amd.com Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIInstructions.td | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 61755b4..30f2a4a 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -774,8 +774,17 @@ def S_CBRANCH_EXECNZ : SOPP } // End isBranch = 1 } // End isTerminator = 1 -//def S_BARRIER : SOPP_ 0x000a, S_BARRIER, []; let hasSideEffects = 1 in { +def S_BARRIER : SOPP 0x000a, (ins), S_BARRIER, + [(int_AMDGPU_barrier_local)] + { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; + let mayLoad = 1; + let mayStore = 1; +} + def S_WAITCNT : SOPP 0x000c, (ins i32imm:$simm16), S_WAITCNT $simm16, [] ; -- 1.8.3.2 From 09715a4574c2e35b02176516f542bc0d1d0dc132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Mon, 17 Jun 2013 12:21:29 +0200 Subject: [PATCH v2 5/5] R600/SI: Initial local memory support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enough for the radeonsi driver to use it for calculating derivatives. Reviewed-by: Tom Stellard thomas.stell...@amd.com Signed-off-by: Michel Dänzer michel.daen...@amd.com --- v2: Enable some lit testing of local memory on SI. 
lib/Target/R600/AMDGPUAsmPrinter.cpp | 7 +++ lib/Target/R600/AMDGPUISelLowering.cpp| 4 +- lib/Target/R600/R600ISelLowering.cpp | 2 + lib/Target/R600/SIDefines.h | 4 ++ lib/Target/R600/SIISelLowering.cpp| 5 ++ lib/Target/R600/SIInstructions.td | 15 ++ test/CodeGen/R600/local-memory-two-objects.ll | 51 test/CodeGen/R600/local-memory.ll | 67 ++- 8 files changed, 100 insertions(+), 55 deletions(-) create mode 100644 test/CodeGen/R600/local-memory-two-objects.ll diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 996d2a6..e039b77 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -233,7 +233,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction MF) { OutStreamer.EmitIntValue(RsrcReg, 4); OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4); + + if (MFI-ShaderType == ShaderType::COMPUTE) { +OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); +OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(RoundUpToAlignment(MFI-LDSSize, 256) 8), 4); + } if (MFI-ShaderType == ShaderType::PIXEL) { +OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); +OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(RoundUpToAlignment(MFI-LDSSize, 256) 8), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); OutStreamer.EmitIntValue(MFI-PSInputAddr, 4); } diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 4019a1f..7fad3bb 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -72,8 +72,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) :
Re: [Mesa-dev] R600/SI: Initial double precision support for Radeon SI
Hi Tom, All these patches look good to me, but #2 and #6 should have a test case with them. If you resubmit these patches with test cases, I will push the entire series. I have attached an updated patchset. I have added a test case to patch #2 and #6. I have also replaced the scalar move in patch #2 by a vector move since there is probably no point in having a floating point value in a scalar register. Kind regards, OleFrom 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001 From: Niels Ole Salscheider niels_...@salscheider-online.de Date: Sat, 1 Jun 2013 16:48:56 +0200 Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI --- lib/Target/R600/AMDGPUISelLowering.cpp | 6 ++ lib/Target/R600/SIISelLowering.cpp | 1 + lib/Target/R600/SIInstructions.td | 30 +- test/CodeGen/R600/fadd64.ll| 13 + test/CodeGen/R600/fdiv64.ll| 14 ++ test/CodeGen/R600/fmul64.ll| 13 + test/CodeGen/R600/load64.ll| 20 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-) create mode 100644 test/CodeGen/R600/fadd64.ll create mode 100644 test/CodeGen/R600/fdiv64.ll create mode 100644 test/CodeGen/R600/fmul64.ll create mode 100644 test/CodeGen/R600/load64.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 4019a1f..5f3d496 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setOperationAction(ISD::LOAD, MVT::f32, Promote); AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, 
MVT::i64); + setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 9d4cfef..0d17a12 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass); diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..b956387 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] ; defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, []; -defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, []; +defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, + [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] +; defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, []; defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, []; defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, []; @@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, V_LSHR_B64, ; def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, []; +let isCommutable = 1 in { + def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, []; def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, []; def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, []; def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, []; + +} // isCommutable = 1 + +def : Pat + (fadd f64:$src0, f64:$src1), + (V_ADD_F64 $src0, $src1, (i64 0)) +; + +def : Pat + (fmul f64:$src0, f64:$src1), + (V_MUL_F64 $src0, $src1, (i64 0)) +; + def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, []; let isCommutable = 1 in { @@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32; def : BitConvert 
f32, i32, SReg_32; def : BitConvert f32, i32, VReg_32; +def : BitConvert i64, f64, VReg_64; + +def : BitConvert f64, i64, VReg_64; + /** === **/ /** Src Dst modifiers **/ /** === **/ @@ -1505,6 +1526,11 @@ def : Pat (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1)) ; +def : Pat + (fdiv f64:$src0, f64:$src1), + (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0)) +; + def : Pat (fcos f32:$src0), (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) @@ -1634,6 +1660,8 @@ multiclass MUBUFLoad_Pattern MUBUF Instr_ADDR64, ValueType vt, ; } +defm : MUBUFLoad_Pattern
Re: [Mesa-dev] R600/SI: Initial double precision support for Radeon SI
On Tue, Jul 02, 2013 at 10:44:10AM +0200, Niels Ole Salscheider wrote: Hi, the attached patches add initial support for double precision operations on Southern Islands cards. Some expressions containing multiple double precision kernel arguments cause llvm to run until all memory is used - but I do not (yet) know why. It works fine as long as I pass pointers to double values. I may have an idea about why this is happening. Could you file a bug report and attach an LLVM IR test case? All these patches look good to me, but #2 and #6 should have a test case with them. If you resubmit these patches with test cases, I will push the entire series. Nice work! -Tom Regards, Ole From 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001 From: Niels Ole Salscheider niels_...@salscheider-online.de Date: Sat, 1 Jun 2013 16:48:56 +0200 Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI --- lib/Target/R600/AMDGPUISelLowering.cpp | 6 ++ lib/Target/R600/SIISelLowering.cpp | 1 + lib/Target/R600/SIInstructions.td | 30 +- test/CodeGen/R600/fadd64.ll| 13 + test/CodeGen/R600/fdiv64.ll| 14 ++ test/CodeGen/R600/fmul64.ll| 13 + test/CodeGen/R600/load64.ll| 20 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-) create mode 100644 test/CodeGen/R600/fadd64.ll create mode 100644 test/CodeGen/R600/fdiv64.ll create mode 100644 test/CodeGen/R600/fmul64.ll create mode 100644 test/CodeGen/R600/load64.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 4019a1f..5f3d496 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setOperationAction(ISD::LOAD, MVT::f32, 
Promote); AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 9d4cfef..0d17a12 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass); diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..b956387 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] ; defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, []; -defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, []; +defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, + [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] +; defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, []; defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, []; defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, []; @@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, V_LSHR_B64, ; def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, []; +let isCommutable = 1 in { + def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, []; def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, []; def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, []; def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, []; + +} // isCommutable = 1 + +def : Pat + (fadd f64:$src0, f64:$src1), + (V_ADD_F64 
$src0, $src1, (i64 0)) +; + +def : Pat + (fmul f64:$src0, f64:$src1), + (V_MUL_F64 $src0, $src1, (i64 0)) +; + def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, []; let isCommutable = 1 in { @@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32; def : BitConvert f32, i32, SReg_32; def : BitConvert f32, i32, VReg_32; +def : BitConvert i64, f64, VReg_64; + +def : BitConvert f64, i64, VReg_64; + /** === **/ /** Src Dst modifiers **/ /** === **/ @@ -1505,6 +1526,11 @@ def : Pat
[Mesa-dev] R600/SI: Initial double precision support for Radeon SI
Hi, the attached patches add initial support for double precision operations on Southern Islands cards. Some expressions containing multiple double precision kernel arguments cause llvm to run until all memory is used - but I do not (yet) know why. It works fine as long as I pass pointers to double values. Regards, OleFrom 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001 From: Niels Ole Salscheider niels_...@salscheider-online.de Date: Sat, 1 Jun 2013 16:48:56 +0200 Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI --- lib/Target/R600/AMDGPUISelLowering.cpp | 6 ++ lib/Target/R600/SIISelLowering.cpp | 1 + lib/Target/R600/SIInstructions.td | 30 +- test/CodeGen/R600/fadd64.ll| 13 + test/CodeGen/R600/fdiv64.ll| 14 ++ test/CodeGen/R600/fmul64.ll| 13 + test/CodeGen/R600/load64.ll| 20 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-) create mode 100644 test/CodeGen/R600/fadd64.ll create mode 100644 test/CodeGen/R600/fdiv64.ll create mode 100644 test/CodeGen/R600/fmul64.ll create mode 100644 test/CodeGen/R600/load64.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 4019a1f..5f3d496 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::f64, Promote); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + setOperationAction(ISD::LOAD, MVT::f32, Promote); AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, 
Expand); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 9d4cfef..0d17a12 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass); diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..b956387 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] ; defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, []; -defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, []; +defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, + [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] +; defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, []; defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, []; defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, []; @@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, V_LSHR_B64, ; def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, []; +let isCommutable = 1 in { + def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, []; def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, []; def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, []; def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, []; + +} // isCommutable = 1 + +def : Pat + (fadd f64:$src0, f64:$src1), + (V_ADD_F64 $src0, $src1, (i64 0)) +; + +def : Pat + (fmul f64:$src0, f64:$src1), + (V_MUL_F64 $src0, $src1, (i64 0)) +; + def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, []; let isCommutable = 1 in { @@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32; def : BitConvert f32, i32, SReg_32; def : BitConvert f32, i32, VReg_32; +def : BitConvert i64, f64, VReg_64; + +def : 
BitConvert f64, i64, VReg_64; + /** === **/ /** Src Dst modifiers **/ /** === **/ @@ -1505,6 +1526,11 @@ def : Pat (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1)) ; +def : Pat + (fdiv f64:$src0, f64:$src1), + (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0)) +; + def : Pat (fcos f32:$src0), (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) @@ -1634,6 +1660,8 @@ multiclass MUBUFLoad_Pattern MUBUF Instr_ADDR64, ValueType vt, ; } +defm : MUBUFLoad_Pattern BUFFER_LOAD_DWORDX2_ADDR64, i64, + global_load, constant_load; defm : MUBUFLoad_Pattern
Re: [Mesa-dev] R600/SI: Support for local memory and derivatives
On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote: These patches implement enough of local memory support to allow radeonsi to use that for computing derivatives, as suggested by Tom. They also almost allow test/CodeGen/R600/local-memory.ll to generate code for SI. Right now it still fails because it tries to copy a VGPR to an SGPR, which is not possible. Can you add some lit tests for these new intrinsics and also add CHECK lines for SI to the existing local-memory.ll test. With the tests added, these patches are: Reviewed-by: Tom Stellard thomas.stell...@amd.com -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer From f4ca359c4536aa53122b654196f2e007d50976f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 16:12:45 +0100 Subject: [PATCH 1/6] R600/SI: Add intrinsics for texture sampling with user derivatives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIInstructions.td | 7 ++- lib/Target/R600/SIIntrinsics.td | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..c9eac7d 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B; //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027; def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029; -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a; +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b; def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L; def IMAGE_SAMPLE_C_B : 
MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B; @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type { def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type; def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; + + def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; + def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; } defm : SamplePatternsv2i32; diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 224cd2f..d2643e0 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in { def int_SI_sample : Sample; def int_SI_sampleb : Sample; + def int_SI_sampled : Sample; def int_SI_samplel : Sample; def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]; -- 1.8.3.1 From 7a0048bb2ab1b661831da2b764bf1a52f66bec15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 18:51:38 +0100 Subject: [PATCH v3 2/6] R600/SI: Initial support for LDS/GDS instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- v3: Drop vdst operand from DS_Store_Helper class, and adapt SIInsertWaits::getHwCounts() to handle that. Unfortunately, this seems to mess up the asm string output somehow, not sure what's going on there. 
lib/Target/R600/SIInsertWaits.cpp | 2 ++ lib/Target/R600/SIInstrFormats.td | 24 lib/Target/R600/SIInstrInfo.td | 23 +++ lib/Target/R600/SIInstructions.td | 3 +++ lib/Target/R600/SILowerControlFlow.cpp | 16 5 files changed, 68 insertions(+) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index c36e1dc..d31da45 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -134,6 +134,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr MI) { if (TSFlags SIInstrFlags::LGKM_CNT) { MachineOperand Op = MI.getOperand(0); +if (!Op.isReg()) + Op = MI.getOperand(1); assert(Op.isReg() First LGKM operand must be a register!); unsigned Reg = Op.getReg(); diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 51f323d..434aa7e 100644 ---
Re: [Mesa-dev] R600 Patches: KCache kernel arguments and 24-bit arithmetic
The whole series is: Reviewed-by: Vincent Lejeune vljn at ovi.com In a future patch we might also remove the ISD::BUILD_VECTOR case in the Select() function and use a tablegen pattern; I wrote it because we lowered the r600.load.input intrinsic to a raw register; however, now we lower it to a copy from a register, which should be convertible to a REG_SEQUENCE. Vincent - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Mardi 25 juin 2013 23h37 Objet : [Mesa-dev] R600 Patches: KCache kernel arguments and 24-bit arithmetic Hi, The attached patches clean up kernel argument handling for both R600 and SI and for R600 makes it possible to read arguments through the KCache. There are also patches that add support for the 24-bit arithmetic instructions (MAD_UINT24, MAD_INT24, MUL_UINT24, and MUL_INT24). In order to test these patches you will also need to apply the corresponding Mesa patches which will be on the mailing list soon. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patches: Add support for the local address space
Hi Vincent, Here is an updated version of patch #3. -Tom On Fri, Jun 14, 2013 at 08:35:03AM -0700, Vincent Lejeune wrote: Hi, Thank for your work on this ! Patch 2, 4 and 5 have my rb. diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b9da74c..6de47f7 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -133,6 +133,12 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const { bool R600InstrInfo::isALUInstr(unsigned Opcode) const { unsigned TargetFlags = get(Opcode).TSFlags; + return (TargetFlags R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + return ((TargetFlags R600_InstFlag::OP1) | (TargetFlags R600_InstFlag::OP2) | (TargetFlags R600_InstFlag::OP3)); Function prototype is not defined here (it is defined in patch 5). diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index a330d88..acc1b4d 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -269,10 +269,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Does the instruction take a whole IG ? +// XXX: Is it possible to add a helper function in R600InstrInfo that can +// be used here and in R600PacketizerList::isSoloInstruction() ? if(TII-isVector(*MI) || TII-isCubeOp(MI-getOpcode()) || -TII-isReductionOp(MI-getOpcode())) +TII-isReductionOp(MI-getOpcode()) || +MI-getOpcode() == AMDGPU::GROUP_BARRIER) { return AluT_XYZW; +} I'm not sure it'll factorize that much code ; R600Packetizer is called after cube/reduction op are lowered by R600Expand pass and thus the isVector/ReductionOp check is useless. I may have left some debug code in isSoloInstruction code though. 
- Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Jeudi 13 juin 2013 2h42 Objet : [Mesa-dev] R600 Patches: Add support for the local address space Hi, The attached patches add support for local address space on Evergreen / Northern Islands GPUs. Please Review. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev From e5c9de74bcd7625b954aa3f070e4cb9a4b920c85 Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stell...@amd.com Date: Wed, 12 Jun 2013 09:02:39 -0700 Subject: [PATCH] R600: Add ALUInst bit to tablegen definitions v2 v2: - Remove functions left over from a previous rebase. --- lib/Target/R600/R600Defines.h | 3 ++- lib/Target/R600/R600InstrFormats.td | 2 ++ lib/Target/R600/R600InstrInfo.cpp | 4 +--- lib/Target/R600/R600Instructions.td | 3 +++ 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index e30ea27..6bcf8ae 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -41,7 +41,8 @@ namespace R600_InstFlag { OP1 = (1 10), OP2 = (1 11), VTX_INST = (1 12), -TEX_INST = (1 13) +TEX_INST = (1 13), +ALU_INST = (1 14) }; } diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td index d31f18c..2c98fb9 100644 --- a/lib/Target/R600/R600InstrFormats.td +++ b/lib/Target/R600/R600InstrFormats.td @@ -26,6 +26,7 @@ class InstR600 dag outs, dag ins, string asm, listdag pattern, bit HasNativeOperands = 0; bit VTXInst = 0; bit TEXInst = 0; + bit ALUInst = 0; let Namespace = AMDGPU; let OutOperandList = outs; @@ -47,6 +48,7 @@ class InstR600 dag outs, dag ins, string asm, listdag pattern, let TSFlags{11} = Op2; let TSFlags{12} = VTXInst; let TSFlags{13} = TEXInst; + let TSFlags{14} = ALUInst; } //===--===// diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 
d17425f..f267ee9 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -133,9 +133,7 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const { bool R600InstrInfo::isALUInstr(unsigned Opcode) const { unsigned TargetFlags = get(Opcode).TSFlags; - return ((TargetFlags R600_InstFlag::OP1) | - (TargetFlags R600_InstFlag::OP2) | - (TargetFlags R600_InstFlag::OP3)); + return (TargetFlags R600_InstFlag::ALU_INST); } bool R600InstrInfo::isTransOnly(unsigned Opcode) const { diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index d819d44..b0a82ff 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600
Re: [Mesa-dev] R600: Expand integer operations for SI and consolidate code with EG
On Thu, Jun 20, 2013 at 06:43:38PM -0500, Aaron Watry wrote: This series is intended to bring SI closer to evergreen when it comes to support for v2i32/v4i32 integer operations. It adds support for expanding the following v2i32/v4i32 operations on SI: AND, MUL, OR, SHL, SRL, ASHR, UDIV, UREM, XOR Once that's done, the setOperationAction(op,type,Expand) calls that appear in both R600ISelLowering.cpp and SIISelLowering.cpp are all moved to AMDGPUISelLowering.cpp. If we decide to implement these ops through native instructions for either target in the future, we can override that in the individual targets. Signed-off-by: Aaron Watry awa...@gmail.com Just one small comment on the SHL patch, but with that fixed these patches are: Reviewed-by: Tom Stellard thomas.stell...@amd.com R600/SI: Expand and of v2i32/v4i32 for SI R600/SI: Expand mul of v2i32/v4i32 for SI R600/SI: Expand or of v2i32/v4i32 for SI R600/SI: Expand shl of v2i32/v4i32 for SI R600/SI: Expand srl of v2i32/v4i32 for SI R600/SI: Expand ashr of v2i32/v4i32 for SI R600/SI: Expand udiv v[24]i32 for SI and v2i32 for EG R600/SI: Expand urem of v2i32/v4i32 for SI R600: Add v2i32 test for setcc on evergreen R600/SI: Expand xor v2i32/v4i32 R600: Add v2i32 test for vselect R600: Consolidate expansion of v2i32/v4i32 ops for SI/EG lib/Target/R600/AMDGPUISelLowering.cpp | 22 lib/Target/R600/R600ISelLowering.cpp | 18 - lib/Target/R600/SIISelLowering.cpp | 5 test/CodeGen/R600/and.ll | 37 +- test/CodeGen/R600/mul.ll | 38 ++- test/CodeGen/R600/or.ll| 41 - test/CodeGen/R600/setcc.ll | 25 +++--- test/CodeGen/R600/shl.ll | 47 ++ test/CodeGen/R600/sra.ll | 41 - test/CodeGen/R600/srl.ll | 42 +- test/CodeGen/R600/udiv.ll | 25 +++--- test/CodeGen/R600/urem.ll | 27 --- test/CodeGen/R600/vselect.ll | 26 ++- test/CodeGen/R600/xor.ll | 40 - 14 files changed, 345 insertions(+), 89 deletions(-) ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list 
mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600: Expand integer operations for SI and consolidate code with EG
This series is intended to bring SI closer to evergreen when it comes to support for v2i32/v4i32 integer operations. It adds support for expanding the following v2i32/v4i32 operations on SI: AND, MUL, OR, SHL, SRL, ASHR, UDIV, UREM, XOR Once that's done, the setOperationAction(op,type,Expand) calls that appear in both R600ISelLowering.cpp and SIISelLowering.cpp are all moved to AMDGPUISelLowering.cpp. If we decide to implement these ops through native instructions for either target in the future, we can override that in the individual targets. Signed-off-by: Aaron Watry awa...@gmail.com R600/SI: Expand and of v2i32/v4i32 for SI R600/SI: Expand mul of v2i32/v4i32 for SI R600/SI: Expand or of v2i32/v4i32 for SI R600/SI: Expand shl of v2i32/v4i32 for SI R600/SI: Expand srl of v2i32/v4i32 for SI R600/SI: Expand ashr of v2i32/v4i32 for SI R600/SI: Expand udiv v[24]i32 for SI and v2i32 for EG R600/SI: Expand urem of v2i32/v4i32 for SI R600: Add v2i32 test for setcc on evergreen R600/SI: Expand xor v2i32/v4i32 R600: Add v2i32 test for vselect R600: Consolidate expansion of v2i32/v4i32 ops for SI/EG lib/Target/R600/AMDGPUISelLowering.cpp | 22 lib/Target/R600/R600ISelLowering.cpp | 18 - lib/Target/R600/SIISelLowering.cpp | 5 test/CodeGen/R600/and.ll | 37 +- test/CodeGen/R600/mul.ll | 38 ++- test/CodeGen/R600/or.ll| 41 - test/CodeGen/R600/setcc.ll | 25 +++--- test/CodeGen/R600/shl.ll | 47 ++ test/CodeGen/R600/sra.ll | 41 - test/CodeGen/R600/srl.ll | 42 +- test/CodeGen/R600/udiv.ll | 25 +++--- test/CodeGen/R600/urem.ll | 27 --- test/CodeGen/R600/vselect.ll | 26 ++- test/CodeGen/R600/xor.ll | 40 - 14 files changed, 345 insertions(+), 89 deletions(-) ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600/SI: Support for local memory and derivatives
These patches implement enough of local memory support to allow radeonsi to use that for computing derivatives, as suggested by Tom. They also almost allow test/CodeGen/R600/local-memory.ll to generate code for SI. Right now it still fails because it tries to copy a VGPR to an SGPR, which is not possible. -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer From f4ca359c4536aa53122b654196f2e007d50976f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 16:12:45 +0100 Subject: [PATCH 1/6] R600/SI: Add intrinsics for texture sampling with user derivatives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIInstructions.td | 7 ++- lib/Target/R600/SIIntrinsics.td | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 9c96c08..c9eac7d 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B; //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027; def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029; -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a; +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b; def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L; def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B; @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type { def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type; def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; def : 
SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; + + def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; + def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; } defm : SamplePatternsv2i32; diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 224cd2f..d2643e0 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in { def int_SI_sample : Sample; def int_SI_sampleb : Sample; + def int_SI_sampled : Sample; def int_SI_samplel : Sample; def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]; -- 1.8.3.1 From 7a0048bb2ab1b661831da2b764bf1a52f66bec15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 18:51:38 +0100 Subject: [PATCH v3 2/6] R600/SI: Initial support for LDS/GDS instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- v3: Drop vdst operand from DS_Store_Helper class, and adapt SIInsertWaits::getHwCounts() to handle that. Unfortunately, this seems to mess up the asm string output somehow, not sure what's going on there. 
lib/Target/R600/SIInsertWaits.cpp | 2 ++ lib/Target/R600/SIInstrFormats.td | 24 lib/Target/R600/SIInstrInfo.td | 23 +++ lib/Target/R600/SIInstructions.td | 3 +++ lib/Target/R600/SILowerControlFlow.cpp | 16 5 files changed, 68 insertions(+) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index c36e1dc..d31da45 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -134,6 +134,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr MI) { if (TSFlags SIInstrFlags::LGKM_CNT) { MachineOperand Op = MI.getOperand(0); +if (!Op.isReg()) + Op = MI.getOperand(1); assert(Op.isReg() First LGKM operand must be a register!); unsigned Reg = Op.getReg(); diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 51f323d..434aa7e 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -281,6 +281,30 @@ class VINTRP bits 2 op, dag outs, dag ins, string asm, listdag pattern : let Uses = [EXEC] in { +class DS bits8 op, dag outs, dag ins, string asm, listdag pattern : +Enc64 outs, ins, asm, pattern { + + bits8 vdst; + bits1 gds; + bits8 addr; + bits8 data0; + bits8 data1; + bits8 offset0; + bits8 offset1; + + let
[Mesa-dev] R600: Bugfixes
Hi, these patches fix 2 bugs in the R600 backend. The first one uses the correct rv710/rv730 encoding for TEX clauses with more than 8 instructions. This bug has been spotted here: https://bugs.freedesktop.org/show_bug.cgi?id=64257 The other patch fixes a typo that causes instructions not to use the PV/PS registers when R600Packetizers evaluates read port limitations. The typo prevents some bundling opportunities in some (not so frequent) situations. Vincent 0001-R600-Properly-set-COUNT_3-bit-in-TEX-clause-initiati.patch Description: Binary data 0002-R600-PV-stores-Reg-id-not-index.patch Description: Binary data ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600: Bugfixes
On Mon, Jun 17, 2013 at 9:43 AM, Vincent Lejeune v...@ovi.com wrote: Hi, these patches fix 2 bugs in R600 backend. The first one use the rv710/rv730 correct encoding for TEX clause with more than 8 instructions. This bug has been spoted there : https://bugs.freedesktop.org/show_bug.cgi?id=64257 The other patch fix a typo that causes instructions not to use PV/PS register when R600Packetizers evaluates read port limitations. It prevents some bundling opportunities in some (not so frequent) situation. Reviewed-by: Alex Deucher alexander.deuc...@amd.com ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600: Bugfixes
On Mon, Jun 17, 2013 at 06:43:09AM -0700, Vincent Lejeune wrote: Hi, these patches fix 2 bugs in R600 backend. The first one use the rv710/rv730 correct encoding for TEX clause with more than 8 instructions. This bug has been spoted there : https://bugs.freedesktop.org/show_bug.cgi?id=64257 The other patch fix a typo that causes instructions not to use PV/PS register when R600Packetizers evaluates read port limitations. It prevents some bundling opportunities in some (not so frequent) situation. Vincent Both patches are Reviewed-by: Tom Stellard thomas.stell...@amd.com Can you add the bugzilla link to the commit message of patch #1 ? -Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600: Various fixes for R600 and SI
The first patch fixes load/store for v2i32 on R600. Without this, the other two will cause make check failures. I've verified the changes using a Radeon 5400 (Cedar). Note that the previous custom lowering of v2i32 store was causing silent data corruption. The other two patches expand add/sub on SI for both v2i32 and v4i32 types. Lit tests for v2i32 have been added. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patches: Add support for the local address space
Hi, Thank for your work on this ! Patch 2, 4 and 5 have my rb. diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b9da74c..6de47f7 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -133,6 +133,12 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const { bool R600InstrInfo::isALUInstr(unsigned Opcode) const { unsigned TargetFlags = get(Opcode).TSFlags; + return (TargetFlags R600_InstFlag::ALU_INST); +} + +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + return ((TargetFlags R600_InstFlag::OP1) | (TargetFlags R600_InstFlag::OP2) | (TargetFlags R600_InstFlag::OP3)); Function prototype is not defined here (it is defined in patch 5). diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index a330d88..acc1b4d 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -269,10 +269,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { } // Does the instruction take a whole IG ? +// XXX: Is it possible to add a helper function in R600InstrInfo that can +// be used here and in R600PacketizerList::isSoloInstruction() ? if(TII-isVector(*MI) || TII-isCubeOp(MI-getOpcode()) || -TII-isReductionOp(MI-getOpcode())) +TII-isReductionOp(MI-getOpcode()) || +MI-getOpcode() == AMDGPU::GROUP_BARRIER) { return AluT_XYZW; +} I'm not sure it'll factorize that much code ; R600Packetizer is called after cube/reduction op are lowered by R600Expand pass and thus the isVector/ReductionOp check is useless. I may have left some debug code in isSoloInstruction code though. 
- Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Jeudi 13 juin 2013 2h42 Objet : [Mesa-dev] R600 Patches: Add support for the local address space Hi, The attached patches add support for local address space on Evergreen / Northern Islands GPUs. Please Review. -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patches: Add support for the local address space
On Wed, Jun 12, 2013 at 06:37:39PM -0700, Matt Arsenault wrote: On 06/12/2013 05:42 PM, Tom Stellard wrote: Hi, The attached patches add support for local address space on Evergreen / Northern Islands GPUs. Please Review. -Tom + def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; You probably want to mark this as IntrReadMem to try to avoid reordering stores around the barrier. I don't think the intrinsic as defined will have stores reordered around it. From include/llvm/IR/Intrinsics.td: // Intr*Mem - Memory properties. An intrinsic is allowed to have at most one of // these properties set. They are listed from the most aggressive (best to use // if correct) to the least aggressive. If no property is set, the worst case // is assumed (it may read and write any memory it can get access to and // it may have other side effects). -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patches: Add support for the local address space
On 06/12/2013 05:42 PM, Tom Stellard wrote: Hi, The attached patches add support for local address space on Evergreen / Northern Islands GPUs. Please Review. -Tom + def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; You probably want to mark this as IntrReadMem to try to avoid reordering stores around the barrier. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI: Intrinsics for derivatives
On Sam, 2013-06-08 at 20:08 -0400, Tom Stellard wrote: On Fri, Jun 07, 2013 at 05:48:05PM -0700, Tom Stellard wrote: On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote: @@ -1544,6 +1562,26 @@ def : Pat sub3) ; +class DDXY Intrinsic name, bits4 ldsdelta : Pat + (name v4f32:$src, imm, imm, imm), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), +(SI_DD (EXTRACT_SUBREG $src, sub0), (V_LSHLREV_B32_e32 2, (SI_TID)), + (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))), + ldsdelta), sub0), +(SI_DD (EXTRACT_SUBREG $src, sub1), (V_LSHLREV_B32_e32 2, (SI_TID)), + (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))), + ldsdelta), sub1), +(SI_DD (EXTRACT_SUBREG $src, sub2), (V_LSHLREV_B32_e32 2, (SI_TID)), + (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))), + ldsdelta), sub2), +(SI_DD (EXTRACT_SUBREG $src, sub3), (V_LSHLREV_B32_e32 2, (SI_TID)), + (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))), + ldsdelta), sub3) +; Based on this pattern, I don't think you need to use a ddx/ddy intrinsic here. All of the instructions you are lowering DDX/DDY to have an equivalent LLVM IR instruction or LLVM intrinsic. For the DS_READ and DS_WRITE instructions all you need to do is emit load/stores to the local address space and then add patterns for those int the backend. As an added bonus this will add support for OpenCL local address spaces. I think the rest of the instructions are pretty straight forward (unless I've overlooked something). Let me know if you have any questions. I did overlook something. You will need to add an intrinsic for thread id in order to implement ddx/ddy completely in LLVM IR, but I still think it is the best way. Shoot, I was just happy I finally got all the piglit tests passing. :) But I agree your suggested approach would be better, I'll give it a go. 
-- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI: Intrinsics for derivatives
On Fri, Jun 07, 2013 at 05:48:05PM -0700, Tom Stellard wrote: On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote: The most important difference to the previous version of these is that whole quad mode is now enabled and M0 initialized appropriately for the LDS instructions, which now allows all of the relevant piglit tests to pass. Hi Michel, After I gave this series my r-b, I was reviewing your Mesa patches, and I suddenly had an idea for a better way to implement this. See my comments below: From bb5adcd52cc5cadc308e85f635675199f5c02f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 17:56:22 +0100 Subject: [PATCH 3/3] R600/SI: Support AMDGPU.ddx/y intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use LDS for calculating the deltas between neighbouring pixels. Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 77 +- lib/Target/R600/SIISelLowering.h | 6 +++ lib/Target/R600/SIInstructions.td | 42 - 3 files changed, 121 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index ac6a4c3..7ea226a 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -249,7 +249,7 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - + MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); MachineBasicBlock::iterator I = *MI; switch (MI-getOpcode()) { @@ -257,7 +257,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = 
MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); unsigned SubRegHi = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -282,10 +281,84 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI-eraseFromParent(); break; } + case AMDGPU::SI_DD: +LowerSI_DD(MI, *BB, I, MRI); +break; + case AMDGPU::SI_TID: +LowerSI_TID(MI, *BB, I, MRI); +break; } return BB; } +void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock BB, +MachineBasicBlock::iterator I, MachineRegisterInfo MRI) const { + unsigned coord0 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass); + unsigned coord1 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass); + MachineOperand dst = MI-getOperand(0); + MachineOperand coord = MI-getOperand(1); + MachineOperand ldsaddr = MI-getOperand(2); + MachineOperand ldsaddr0 = MI-getOperand(3); + MachineOperand ldsdelta = MI-getOperand(4); + + // Write this thread's coordinate to LDS + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_WRITE_B32)) + .addOperand(coord) + .addImm(0) // LDS + .addOperand(ldsaddr) + .addOperand(coord) + .addOperand(coord) + .addImm(0) + .addImm(0); + + // Read top right / bottom left thread's coordinate from LDS + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord0) + .addImm(0) // LDS + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addOperand(ldsdelta) + .addImm(0); + + // Read top left thread's coordinate from LDS + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord1) + .addImm(0) // LDS + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addImm(0) + .addImm(0); + + // Subtract top left coordinate from top right / bottom left + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_SUB_F32_e32)) + .addOperand(dst) + .addReg(coord0) + .addReg(coord1); + + MI-eraseFromParent(); +} + +void SITargetLowering::LowerSI_TID(MachineInstr *MI, MachineBasicBlock BB, +MachineBasicBlock::iterator I, MachineRegisterInfo MRI) 
const { + unsigned mbcnt_lo = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass); + MachineOperand dst = MI-getOperand(0); + + // Get this thread's ID + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo) + .addImm(0x) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) +
[Mesa-dev] R600/SI: Intrinsics for derivatives
The most important difference to the previous version of these is that whole quad mode is now enabled and M0 initialized appropriately for the LDS instructions, which now allows all of the relevant piglit tests to pass. -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer From db07ab94113be5810fd6d1035b3d394ed53d27ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 16:12:45 +0100 Subject: [PATCH 1/3] R600/SI: Add intrinsics for texture sampling with user derivatives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIInstructions.td | 7 ++- lib/Target/R600/SIIntrinsics.td | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index b6db815..73f87ca 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B; //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027; def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029; -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a; +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b; def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L; def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B; @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type { def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type; def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type; + + def : SamplePattern 
int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type; + def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; + def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type; } defm : SamplePatternsv2i32; diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 224cd2f..d2643e0 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in { def int_SI_sample : Sample; def int_SI_sampleb : Sample; + def int_SI_sampled : Sample; def int_SI_samplel : Sample; def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]; -- 1.8.3 From 466936a680993dec58e1e537f3b489cd82b5176c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 18:51:38 +0100 Subject: [PATCH 2/3] R600/SI: Initial support for LDS/GDS instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIInstrFormats.td | 24 lib/Target/R600/SIInstrInfo.td | 23 +++ lib/Target/R600/SIInstructions.td | 3 +++ lib/Target/R600/SILowerControlFlow.cpp | 16 4 files changed, 66 insertions(+) diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 51f323d..434aa7e 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -281,6 +281,30 @@ class VINTRP bits 2 op, dag outs, dag ins, string asm, listdag pattern : let Uses = [EXEC] in { +class DS bits8 op, dag outs, dag ins, string asm, listdag pattern : +Enc64 outs, ins, asm, pattern { + + bits8 vdst; + bits1 gds; + bits8 addr; + bits8 data0; + bits8 data1; + bits8 offset0; + bits8 offset1; + + let Inst{7-0} = offset0; + let Inst{15-8} = offset1; + let Inst{17} = gds; + let Inst{25-18} = op; + let 
Inst{31-26} = 0x36; //encoding + let Inst{39-32} = addr; + let Inst{47-40} = data0; + let Inst{55-48} = data1; + let Inst{63-56} = vdst; + + let LGKM_CNT = 1; +} + class MUBUF bits7 op, dag outs, dag ins, string asm, listdag pattern : Enc64outs, ins, asm, pattern { diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 42fa95f..47a64f7 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -286,6 +286,29 @@ class VOP3_64 bits9 op, string opName, listdag pattern : VOP3 // Vector I/O classes //===--===// +class DS_Load_Helper bits8 op, string asm, RegisterClass regClass : DS + op, + (outs regClass:$vdst), + (ins
Re: [Mesa-dev] R600/SI: Intrinsics for derivatives
On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote: The most important difference to the previous version of these is that whole quad mode is now enabled and M0 initialized appropriately for the LDS instructions, which now allows all of the relevant piglit tests to pass. For the series: Reviewed-by: Tom Stellard thomas.stell...@amd.com ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI: Intrinsics for derivatives
On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote: The most important difference to the previous version of these is that whole quad mode is now enabled and M0 initialized appropriately for the LDS instructions, which now allows all of the relevant piglit tests to pass. Hi Michel, After I gave this series my r-b, I was reviewing your Mesa patches, and I suddenly had an idea for a better way to implement this. See my comments below: From bb5adcd52cc5cadc308e85f635675199f5c02f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com Date: Thu, 21 Feb 2013 17:56:22 +0100 Subject: [PATCH 3/3] R600/SI: Support AMDGPU.ddx/y intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use LDS for calculating the deltas between neighbouring pixels. Signed-off-by: Michel Dänzer michel.daen...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 77 +- lib/Target/R600/SIISelLowering.h | 6 +++ lib/Target/R600/SIInstructions.td | 42 - 3 files changed, 121 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index ac6a4c3..7ea226a 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -249,7 +249,7 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - + MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); MachineBasicBlock::iterator I = *MI; switch (MI-getOpcode()) { @@ -257,7 +257,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); unsigned SubRegHi = 
MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -282,10 +281,84 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MI-eraseFromParent(); break; } + case AMDGPU::SI_DD: +LowerSI_DD(MI, *BB, I, MRI); +break; + case AMDGPU::SI_TID: +LowerSI_TID(MI, *BB, I, MRI); +break; } return BB; } +void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock BB, +MachineBasicBlock::iterator I, MachineRegisterInfo MRI) const { + unsigned coord0 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass); + unsigned coord1 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass); + MachineOperand dst = MI-getOperand(0); + MachineOperand coord = MI-getOperand(1); + MachineOperand ldsaddr = MI-getOperand(2); + MachineOperand ldsaddr0 = MI-getOperand(3); + MachineOperand ldsdelta = MI-getOperand(4); + + // Write this thread's coordinate to LDS + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_WRITE_B32)) + .addOperand(coord) + .addImm(0) // LDS + .addOperand(ldsaddr) + .addOperand(coord) + .addOperand(coord) + .addImm(0) + .addImm(0); + + // Read top right / bottom left thread's coordinate from LDS + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord0) + .addImm(0) // LDS + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addOperand(ldsdelta) + .addImm(0); + + // Read top left thread's coordinate from LDS + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord1) + .addImm(0) // LDS + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addOperand(ldsaddr0) + .addImm(0) + .addImm(0); + + // Subtract top left coordinate from top right / bottom left + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_SUB_F32_e32)) + .addOperand(dst) + .addReg(coord0) + .addReg(coord1); + + MI-eraseFromParent(); +} + +void SITargetLowering::LowerSI_TID(MachineInstr *MI, MachineBasicBlock BB, +MachineBasicBlock::iterator I, MachineRegisterInfo MRI) const { + unsigned mbcnt_lo = 
MRI.createVirtualRegister(AMDGPU::VReg_32RegClass); + MachineOperand dst = MI-getOperand(0); + + // Get this thread's ID + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo) + .addImm(0x) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0); + BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_MBCNT_HI_U32_B32_e32)) + .addOperand(dst) + .addImm(0x) + .addReg(mbcnt_lo); + +
Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute
On Wed, 2013-05-15 at 14:26 -0700, Tom Stellard wrote: > The attached patches add some new patterns and instructions for SI and are a prerequisite for more invasive compute shader changes that I'm working on. Please review. The SI changes are Reviewed-by: Michel Dänzer <michel.daen...@amd.com> -- Earthling Michel Dänzer | http://www.amd.com Libre software enthusiast | Debian, X and DRI developer ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute
Hi, -- next part -- From dc547a89dac5039ce521f3c27fb23346251d488d Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stellard at amd.com Date: Tue, 7 May 2013 16:26:26 -0400 Subject: [PATCH 4/7] R600: Swap the legality of rotl and rotr The hardware supports rotr and not rotl. --- lib/Target/R600/AMDGPUISelLowering.cpp | 3 +++ lib/Target/R600/AMDGPUISelLowering.h | 1 - lib/Target/R600/AMDGPUInstrInfo.td | 6 -- lib/Target/R600/AMDGPUInstructions.td | 6 ++ lib/Target/R600/AMDILISelLowering.cpp | 2 -- lib/Target/R600/R600ISelLowering.cpp | 15 --- lib/Target/R600/R600Instructions.td| 6 ++ test/CodeGen/R600/rotr.ll | 29 + 8 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 test/CodeGen/R600/rotr.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index a266df5..b3c51e3 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -46,6 +46,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + // The hardware supports ROTR, but not ROTL + setOperationAction(ISD::ROTL, MVT::i32, Expand); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
setOperationAction(ISD::STORE, MVT::f32, Promote); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index c2a79ea..6f8ab8b 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -115,7 +115,6 @@ enum { RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes - BITALIGN, BUFFER_STORE, DWORDADDR, FRACT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index b66ae87..a0a3410 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile1, 3, [ // AMDGPU DAG Nodes // -// out = ((a 32) | b) c) -// -// Can be used to optimize rtol: -// rotl(a, b) = bitalign(a, a, 32 - b) -def AMDGPUbitalign : SDNodeAMDGPUISD::BITALIGN, AMDGPUDTIntTernaryOp; - // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNodeAMDGPUISD::DWORDADDR, SDTIntUnaryOp; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index d2620b2..54df7d0 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -295,6 +295,12 @@ class BFEPattern Instruction BFE : Pat (BFE $x, $y, $z) ; +// rotr pattern +class ROTRPattern Instruction BIT_ALIGN : Pat + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +; + include R600Instructions.td include SIInstrInfo.td diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 922cac1..e20dbe0 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -138,8 +138,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); -// GPU doesn't have a rotl, rotr, or byteswap instruction -setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); // GPU doesn't have any counting operators diff 
--git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 7252235..e58a8dd 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::ROTL, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -327,7 +325,6 @@ using namespace llvm::AMDGPUIntrinsic; SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ROTL: return LowerROTL(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); @@ -518,18 +515,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG DAG) const return DAG.getConstant(Offset * 4 * TFL-getStackWidth(MF), MVT::i32); } -SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG DAG) const { - DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - - return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT, - Op.getOperand(0), -
Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute
On Thu, May 16, 2013 at 08:21:36AM -0700, Vincent Lejeune wrote: Hi, -- next part -- From dc547a89dac5039ce521f3c27fb23346251d488d Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stellard at amd.com Date: Tue, 7 May 2013 16:26:26 -0400 Subject: [PATCH 4/7] R600: Swap the legality of rotl and rotr The hardware supports rotr and not rotl. --- lib/Target/R600/AMDGPUISelLowering.cpp | 3 +++ lib/Target/R600/AMDGPUISelLowering.h | 1 - lib/Target/R600/AMDGPUInstrInfo.td | 6 -- lib/Target/R600/AMDGPUInstructions.td | 6 ++ lib/Target/R600/AMDILISelLowering.cpp | 2 -- lib/Target/R600/R600ISelLowering.cpp | 15 --- lib/Target/R600/R600Instructions.td| 6 ++ test/CodeGen/R600/rotr.ll | 29 + 8 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 test/CodeGen/R600/rotr.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index a266df5..b3c51e3 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -46,6 +46,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + // The hardware supports ROTR, but not ROTL + setOperationAction(ISD::ROTL, MVT::i32, Expand); + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
setOperationAction(ISD::STORE, MVT::f32, Promote); diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index c2a79ea..6f8ab8b 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -115,7 +115,6 @@ enum { RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes - BITALIGN, BUFFER_STORE, DWORDADDR, FRACT, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index b66ae87..a0a3410 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile1, 3, [ // AMDGPU DAG Nodes // -// out = ((a 32) | b) c) -// -// Can be used to optimize rtol: -// rotl(a, b) = bitalign(a, a, 32 - b) -def AMDGPUbitalign : SDNodeAMDGPUISD::BITALIGN, AMDGPUDTIntTernaryOp; - // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNodeAMDGPUISD::DWORDADDR, SDTIntUnaryOp; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index d2620b2..54df7d0 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -295,6 +295,12 @@ class BFEPattern Instruction BFE : Pat (BFE $x, $y, $z) ; +// rotr pattern +class ROTRPattern Instruction BIT_ALIGN : Pat + (rotr i32:$src0, i32:$src1), + (BIT_ALIGN $src0, $src0, $src1) +; + include R600Instructions.td include SIInstrInfo.td diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 922cac1..e20dbe0 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -138,8 +138,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); -// GPU doesn't have a rotl, rotr, or byteswap instruction -setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); // GPU doesn't have any counting operators diff 
--git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 7252235..e58a8dd 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::ROTL, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -327,7 +325,6 @@ using namespace llvm::AMDGPUIntrinsic; SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ROTL: return LowerROTL(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); @@ -518,18 +515,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG DAG) const return DAG.getConstant(Offset * 4 * TFL-getStackWidth(MF), MVT::i32); } -SDValue R600TargetLowering::LowerROTL(SDValue Op,
[Mesa-dev] R600/SI Patches: A few cleanups for compute
Hi, The attached patches add some new patterns and instructions for SI and are a prerequisite for more invasive compute shader changes that I'm working on. Please Review. -Tom From 5b87402d1290df5ec8bdbe1333cadb5739a8c8bd Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stell...@amd.com Date: Mon, 13 May 2013 21:50:35 -0400 Subject: [PATCH 1/7] R600/SI: Make fitsRegClass() operands const --- lib/Target/R600/SIISelLowering.cpp | 2 +- lib/Target/R600/SIISelLowering.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 6bd82a5..d7e2981 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -513,7 +513,7 @@ bool SITargetLowering::foldImm(SDValue Operand, int32_t Immediate, } /// \brief Does Op fit into register class RegClass ? -bool SITargetLowering::fitsRegClass(SelectionDAG DAG, SDValue Op, +bool SITargetLowering::fitsRegClass(SelectionDAG DAG, const SDValue Op, unsigned RegClass) const { MachineRegisterInfo MRI = DAG.getMachineFunction().getRegInfo(); diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index de637be..e9ea68a 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -30,7 +30,8 @@ class SITargetLowering : public AMDGPUTargetLowering { bool foldImm(SDValue Operand, int32_t Immediate, bool ScalarSlotUsed) const; - bool fitsRegClass(SelectionDAG DAG, SDValue Op, unsigned RegClass) const; + bool fitsRegClass(SelectionDAG DAG, const SDValue Op, +unsigned RegClass) const; void ensureSRegLimit(SelectionDAG DAG, SDValue Operand, unsigned RegClass, bool ScalarSlotUsed) const; -- 1.8.1.5 From a2d4b16a0022110c6198ed330966911b2bad3361 Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stell...@amd.com Date: Thu, 9 May 2013 16:44:22 -0400 Subject: [PATCH 2/7] R600/SI: Use the same names for VOP3 operands and encoding fields This makes it possible to reorder the operands 
without breaking the encoding. --- lib/Target/R600/SIInstrFormats.td | 62 +++ lib/Target/R600/SIInstrInfo.td| 12 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index f737ddd..51f323d 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -185,25 +185,25 @@ class VOP2 bits6 op, dag outs, dag ins, string asm, listdag pattern : class VOP3 bits9 op, dag outs, dag ins, string asm, listdag pattern : Enc64 outs, ins, asm, pattern { - bits8 VDST; - bits9 SRC0; - bits9 SRC1; - bits9 SRC2; - bits3 ABS; - bits1 CLAMP; - bits2 OMOD; - bits3 NEG; - - let Inst{7-0} = VDST; - let Inst{10-8} = ABS; - let Inst{11} = CLAMP; + bits8 dst; + bits9 src0; + bits9 src1; + bits9 src2; + bits3 abs; + bits1 clamp; + bits2 omod; + bits3 neg; + + let Inst{7-0} = dst; + let Inst{10-8} = abs; + let Inst{11} = clamp; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = SRC0; - let Inst{49-41} = SRC1; - let Inst{58-50} = SRC2; - let Inst{60-59} = OMOD; - let Inst{63-61} = NEG; + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{63-61} = neg; let mayLoad = 0; let mayStore = 0; @@ -213,23 +213,23 @@ class VOP3 bits9 op, dag outs, dag ins, string asm, listdag pattern : class VOP3b bits9 op, dag outs, dag ins, string asm, listdag pattern : Enc64 outs, ins, asm, pattern { - bits8 VDST; - bits9 SRC0; - bits9 SRC1; - bits9 SRC2; - bits7 SDST; - bits2 OMOD; - bits3 NEG; + bits8 dst; + bits9 src0; + bits9 src1; + bits9 src2; + bits7 sdst; + bits2 omod; + bits3 neg; - let Inst{7-0} = VDST; - let Inst{14-8} = SDST; + let Inst{7-0} = dst; + let Inst{14-8} = sdst; let Inst{25-17} = op; let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = SRC0; - let Inst{49-41} = SRC1; - let Inst{58-50} = SRC2; - let Inst{60-59} = OMOD; - let Inst{63-61} = NEG; + let Inst{40-32} = src0; + let Inst{49-41} = src1; + let 
Inst{58-50} = src2; + let Inst{60-59} = omod; + let Inst{63-61} = neg; let mayLoad = 0; let mayStore = 0; diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index c8aecb7..11c8f9d 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -163,8 +163,8 @@ multiclass VOP1_Helper bits8 op, RegisterClass drc, RegisterClass src, i32imm:$omod, i32imm:$neg), opName#_e64 $dst, $src0, $abs, $clamp, $omod, $neg, [] , VOP opName { -let SRC1 = SIOperand.ZERO; -let SRC2 = SIOperand.ZERO; +let src1 = SIOperand.ZERO; +let src2 = SIOperand.ZERO; } } @@ -189,7 +189,7 @@ multiclass VOP2_Helper bits6 op, RegisterClass
Re: [Mesa-dev] R600: Various improvements
From 8aa41148651150eb19332436c76fe490d4b54b1e Mon Sep 17 00:00:00 2001 From: Vincent Lejeune v...@ovi.com Date: Sun, 12 May 2013 16:29:50 +0200 Subject: [PATCH 1/2] R600: Rename 128 bit registers. Almost all instructions that takes a 128 bits reg as input (fetch, export...) have the abilities to swizzle their argument and output. Instead of printing default swizzle for each 128 bits reg, rename T*.XYZW to T* and let instructions print potentially optimized swizzle themselve. Typo here: swizzle themselve - swizzles themselves Both patches are: Reviewed-by: Tom Stellard thomas.stell...@amd.com --- lib/Target/R600/R600Instructions.td | 17 - lib/Target/R600/R600RegisterInfo.td | 2 +- test/CodeGen/R600/llvm.AMDGPU.tex.ll | 32 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 86e4b4a..abaa94b 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1750,8 +1750,7 @@ let usesCustomInserter = 1 in { class RAT_WRITE_CACHELESS_eg dag ins, bits4 comp_mask, string name, listdag pattern -: EG_CF_RAT 0x57, 0x2, 0, (outs), ins, - !strconcat(name, $rw_gpr, $index_gpr, $eop), pattern { +: EG_CF_RAT 0x57, 0x2, 0, (outs), ins, name, pattern { let RIM = 0; // XXX: Have a separate instruction for non-indexed writes. 
let TYPE= 1; @@ -1771,19 +1770,19 @@ class RAT_WRITE_CACHELESS_eg dag ins, bits4 comp_mask, string name, // 32-bit store def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - 0x1, RAT_WRITE_CACHELESS_32_eg, + 0x1, RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop, [(global_store i32:$rw_gpr, i32:$index_gpr)] ; //128-bit store def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - 0xf, RAT_WRITE_CACHELESS_128, + 0xf, RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] ; class VTX_READ_eg string name, bits8 buffer_id, dag outs, listdag pattern -: InstR600ISA outs, (ins MEMxi:$ptr), name# $dst, $ptr, pattern, +: InstR600ISA outs, (ins MEMxi:$ptr), name, pattern, VTX_WORD1_GPR, VTX_WORD0 { // Static fields @@ -1838,7 +1837,7 @@ class VTX_READ_eg string name, bits8 buffer_id, dag outs, listdag pattern } class VTX_READ_8_eg bits8 buffer_id, listdag pattern -: VTX_READ_eg VTX_READ_8, buffer_id, (outs R600_TReg32_X:$dst), +: VTX_READ_eg VTX_READ_8 $dst, $ptr, buffer_id, (outs R600_TReg32_X:$dst), pattern { let MEGA_FETCH_COUNT = 1; @@ -1850,7 +1849,7 @@ class VTX_READ_8_eg bits8 buffer_id, listdag pattern } class VTX_READ_16_eg bits8 buffer_id, listdag pattern -: VTX_READ_eg VTX_READ_16, buffer_id, (outs R600_TReg32_X:$dst), +: VTX_READ_eg VTX_READ_16 $dst, $ptr, buffer_id, (outs R600_TReg32_X:$dst), pattern { let MEGA_FETCH_COUNT = 2; let DST_SEL_X = 0; @@ -1862,7 +1861,7 @@ class VTX_READ_16_eg bits8 buffer_id, listdag pattern } class VTX_READ_32_eg bits8 buffer_id, listdag pattern -: VTX_READ_eg VTX_READ_32, buffer_id, (outs R600_TReg32_X:$dst), +: VTX_READ_eg VTX_READ_32 $dst, $ptr, buffer_id, (outs R600_TReg32_X:$dst), pattern { let MEGA_FETCH_COUNT = 4; @@ -1883,7 +1882,7 @@ class VTX_READ_32_eg bits8 buffer_id, listdag pattern } class VTX_READ_128_eg bits8 buffer_id, 
listdag pattern -: VTX_READ_eg VTX_READ_128, buffer_id, (outs R600_Reg128:$dst), +: VTX_READ_eg VTX_READ_128 $dst.XYZW, $ptr, buffer_id, (outs R600_Reg128:$dst), pattern { let MEGA_FETCH_COUNT = 16; diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index bfc546b..df6004b 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -35,7 +35,7 @@ foreach Index = 0-127 in { Chan; } // 128-bit Temporary Registers - def T#Index#_XYZW : R600Reg_128 T#Index#.XYZW, + def T#Index#_XYZW : R600Reg_128 T#Index#, [!castRegister(T#Index#_X), !castRegister(T#Index#_Y), !castRegister(T#Index#_Z), diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll b/test/CodeGen/R600/llvm.AMDGPU.tex.ll index 74331fa..81fd43d 100644 --- a/test/CodeGen/R600/llvm.AMDGPU.tex.ll +++ b/test/CodeGen/R600/llvm.AMDGPU.tex.ll @@ -1,21 +1,21 @@ ;RUN: llc %s -march=r600 -mcpu=redwood | FileCheck %s -;CHECK: TEX_SAMPLET{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 1
Re: [Mesa-dev] R600: Various improvements
On Sun, May 12, 2013 at 07:41:21AM -0700, Vincent Lejeune wrote: Hi, Patches 2 and 3 factorizes some code from the backend. Patch 3 should avoid some recomputation too, which shouldn't hurt. Patch 4 and 5 rework how textures are handled in our backend. It replaces TGSI like intrinsic (ie intrinsic that uses last argument as TextureTarget which has no sense from hw pov) to intrinsic closer to hw. The pass could be done in mesa but I rather have it in llvm for now to ensure backward compatibility with llvm 3.3. Hi Vincent, Just some small comments on patches 4 and 5. With those changes, this series is: Reviewed-by: Tom Stellard thomas.stell...@amd.com From 3974315f153e67913f8cc4b4d52550bf6ab33e59 Mon Sep 17 00:00:00 2001 From: Vincent Lejeune v...@ovi.com Date: Sun, 12 May 2013 16:29:50 +0200 Subject: [PATCH 4/5] R600: Rename 128 bit registers. --- lib/Target/R600/R600Instructions.td | 17 - lib/Target/R600/R600RegisterInfo.td | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) What is the reason for renaming these registers? Could you add an explanation to the commit message? diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 86e4b4a..abaa94b 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1750,8 +1750,7 @@ let usesCustomInserter = 1 in { class RAT_WRITE_CACHELESS_eg dag ins, bits4 comp_mask, string name, listdag pattern -: EG_CF_RAT 0x57, 0x2, 0, (outs), ins, - !strconcat(name, $rw_gpr, $index_gpr, $eop), pattern { +: EG_CF_RAT 0x57, 0x2, 0, (outs), ins, name, pattern { let RIM = 0; // XXX: Have a separate instruction for non-indexed writes. 
let TYPE= 1; @@ -1771,19 +1770,19 @@ class RAT_WRITE_CACHELESS_eg dag ins, bits4 comp_mask, string name, // 32-bit store def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - 0x1, RAT_WRITE_CACHELESS_32_eg, + 0x1, RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop, [(global_store i32:$rw_gpr, i32:$index_gpr)] ; //128-bit store def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), - 0xf, RAT_WRITE_CACHELESS_128, + 0xf, RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] ; class VTX_READ_eg string name, bits8 buffer_id, dag outs, listdag pattern -: InstR600ISA outs, (ins MEMxi:$ptr), name# $dst, $ptr, pattern, +: InstR600ISA outs, (ins MEMxi:$ptr), name, pattern, VTX_WORD1_GPR, VTX_WORD0 { // Static fields @@ -1838,7 +1837,7 @@ class VTX_READ_eg string name, bits8 buffer_id, dag outs, listdag pattern } class VTX_READ_8_eg bits8 buffer_id, listdag pattern -: VTX_READ_eg VTX_READ_8, buffer_id, (outs R600_TReg32_X:$dst), +: VTX_READ_eg VTX_READ_8 $dst, $ptr, buffer_id, (outs R600_TReg32_X:$dst), pattern { let MEGA_FETCH_COUNT = 1; @@ -1850,7 +1849,7 @@ class VTX_READ_8_eg bits8 buffer_id, listdag pattern } class VTX_READ_16_eg bits8 buffer_id, listdag pattern -: VTX_READ_eg VTX_READ_16, buffer_id, (outs R600_TReg32_X:$dst), +: VTX_READ_eg VTX_READ_16 $dst, $ptr, buffer_id, (outs R600_TReg32_X:$dst), pattern { let MEGA_FETCH_COUNT = 2; let DST_SEL_X = 0; @@ -1862,7 +1861,7 @@ class VTX_READ_16_eg bits8 buffer_id, listdag pattern } class VTX_READ_32_eg bits8 buffer_id, listdag pattern -: VTX_READ_eg VTX_READ_32, buffer_id, (outs R600_TReg32_X:$dst), +: VTX_READ_eg VTX_READ_32 $dst, $ptr, buffer_id, (outs R600_TReg32_X:$dst), pattern { let MEGA_FETCH_COUNT = 4; @@ -1883,7 +1882,7 @@ class VTX_READ_32_eg bits8 buffer_id, listdag pattern } class VTX_READ_128_eg bits8 buffer_id, 
listdag pattern -: VTX_READ_eg VTX_READ_128, buffer_id, (outs R600_Reg128:$dst), +: VTX_READ_eg VTX_READ_128 $dst.XYZW, $ptr, buffer_id, (outs R600_Reg128:$dst), pattern { let MEGA_FETCH_COUNT = 16; diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index bfc546b..df6004b 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -35,7 +35,7 @@ foreach Index = 0-127 in { Chan; } // 128-bit Temporary Registers - def T#Index#_XYZW : R600Reg_128 T#Index#.XYZW, + def T#Index#_XYZW : R600Reg_128 T#Index#, [!castRegister(T#Index#_X), !castRegister(T#Index#_Y), !castRegister(T#Index#_Z), -- 1.8.2.1 From 6840d3e3995283e98cd535db36ba24364f690072 Mon
[Mesa-dev] R600: Expand vselect and SRA for v2i32 and v4i32 (v2)
These two patches fix a number of piglit OpenCL test failures on my HD6850 (Barts). There are no piglit CL test regressions, and LLVM's `make check` runs without any unexpected failures. v2: Add tests for the v4i32 data type. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patchset: Emit true ISA
On Sat, May 04, 2013 at 09:09:25AM -0700, Vincent Lejeune wrote: Hi, Thank for doing this. Patches 1 2 and 3 have my rb. For patch 4: Hi Vincent, Attached is an updated version of patch 4. -Tom @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isFCOp(MI.getOpcode())){ -EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, return; } else { switch(MI.getOpcode()) { -case AMDGPU::RAT_WRITE_CACHELESS_32_eg: -case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - Emit(inst, OS); - break; -} case AMDGPU::CONSTANT_LOAD_eg: case AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST instead and to remove the switch() statement ? @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit((u_int32_t) 0, OS); break; } - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Samedi 4 mai 2013 0h53 Objet : R600 Patchset: Emit true ISA Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. 
Thanks, Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits From 385d53cb2324e59ae91f1b632c789183e658c335 Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stell...@amd.com Date: Fri, 3 May 2013 15:27:23 -0700 Subject: [PATCH] R600: Remove dead code from the CodeEmitter v2 v2: - Replace switch statement with TSFlags query --- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 456 +++-- lib/Target/R600/R600Defines.h | 3 + lib/Target/R600/R600InstrInfo.cpp | 5 +- 3 files changed, 64 insertions(+), 400 deletions(-) diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 10dee20..271a974 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -26,9 +26,6 @@ #include llvm/Support/raw_ostream.h #include stdio.h -#define SRC_BYTE_COUNT 11 -#define DST_BYTE_COUNT 5 - using namespace llvm; namespace { @@ -56,30 +53,14 @@ public: SmallVectorImplMCFixup Fixups) const; private: - void EmitALUInstr(const MCInst MI, SmallVectorImplMCFixup Fixups, -raw_ostream OS) const; - void EmitSrc(const MCInst MI, unsigned OpIdx, raw_ostream OS) const; - void EmitSrcISA(const MCInst MI, unsigned RegOpIdx, unsigned SelOpIdx, -raw_ostream OS) const; - void EmitDst(const MCInst MI, raw_ostream OS) const; - void EmitFCInstr(const MCInst MI, raw_ostream OS) const; - - void EmitNullBytes(unsigned int byteCount, raw_ostream OS) const; - void EmitByte(unsigned int byte, raw_ostream OS) const; - void EmitTwoBytes(uint32_t bytes, raw_ostream OS) const; - void Emit(uint32_t value, raw_ostream OS) const; void Emit(uint64_t value, raw_ostream OS) const; unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - bool isFCOp(unsigned opcode) const; - bool isTexOp(unsigned opcode) const; - bool isFlagSet(const MCInst MI, unsigned Operand, unsigned Flag) const; - }; } // End anonymous 
namespace @@ -125,344 +106,82 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isFCOp(MI.getOpcode())){ -EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + const MCInstrDesc Desc = MCII.get(MI.getOpcode()); + if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || MI.getOpcode() == AMDGPU::KILL) { return; - } else { -
Re: [Mesa-dev] R600 Patchset: Emit true ISA
Reviewed-by: Vincent Lejeune <vljn at ovi.com> - Mail original - De : Tom Stellard t...@stellard.net À : Vincent Lejeune v...@ovi.com Cc : llvm-comm...@cs.uiuc.edu llvm-comm...@cs.uiuc.edu; mesa-dev@lists.freedesktop.org mesa-dev@lists.freedesktop.org Envoyé le : Lundi 6 mai 2013 17h02 Objet : Re: R600 Patchset: Emit true ISA On Sat, May 04, 2013 at 09:09:25AM -0700, Vincent Lejeune wrote: Hi, Thank for doing this. Patches 1 2 and 3 have my rb. For patch 4: Hi Vincent, Attached is an updated version of patch 4. -Tom @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isFCOp(MI.getOpcode())){ - EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, return; } else { switch(MI.getOpcode()) { - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - Emit(inst, OS); - break; - } case AMDGPU::CONSTANT_LOAD_eg: case AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST instead and to remove the switch() statement ? @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit((u_int32_t) 0, OS); break; } - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Samedi 4 mai 2013 0h53 Objet : R600 Patchset: Emit true ISA Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. 
Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. Thanks, Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600: Expand vselect and SRA for v2i32 and v4i32
These two patches fix a number of piglit OpenCL test failures on my HD6850 (Barts). There are no piglit CL test regressions and the llvm make check runs without any unexpected failures. ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600: Expand vselect and SRA for v2i32 and v4i32
On Mon, May 06, 2013 at 07:35:42PM -0500, Aaron Watry wrote: These two patches fix a number of piglit OpenCL test failures on my HD6850 (Barts). There are no piglit CL test regressions and the llvm make check runs without any unexpected failures. Hi Aaron, These patches look good to me, but you should also add some lit tests for these opcodes. For R600, the tests are located in test/CodeGen/R600. A good example to look at is the srl.ll test; the sra.ll test should be more or less the same. fcmp-cnd.ll would be a good example for the vselect.ll test, as a vector version of it should generate vselect nodes (though you'll want to make sure to use i32 types instead of floats). -Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patchset: Emit true ISA
Hi, Thanks for doing this. Patches 1, 2 and 3 have my rb. For patch 4: @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo MCII, void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SmallVectorImplMCFixup Fixups) const { - if (isFCOp(MI.getOpcode())){ -EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + if (MI.getOpcode() == AMDGPU::RETURN || MI.getOpcode() == AMDGPU::FETCH_CLAUSE || MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, return; } else { switch(MI.getOpcode()) { -case AMDGPU::RAT_WRITE_CACHELESS_32_eg: -case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - Emit(inst, OS); - break; -} case AMDGPU::CONSTANT_LOAD_eg: case AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST instead, and remove the switch() statement? @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, Emit((u_int32_t) 0, OS); break; } - Mail original - De : Tom Stellard t...@stellard.net À : llvm-comm...@cs.uiuc.edu Cc : mesa-dev@lists.freedesktop.org Envoyé le : Samedi 4 mai 2013 0h53 Objet : R600 Patchset: Emit true ISA Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. Thanks, Tom ___ llvm-commits mailing list llvm-comm...@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] R600 Patchset: Emit true ISA
This series, and the associated mesa changes are all: Tested-By: Aaron Watry awa...@gmail.com --Aaron On Fri, May 3, 2013 at 5:53 PM, Tom Stellard t...@stellard.net wrote: Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. Thanks, Tom ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 sb test results
On Fri, 03 May 2013 01:27:27 +0400 Vadim Girlin vadimgir...@gmail.com wrote: I'm almost sure that the same issue that you have with glxgears affects your app too, so you might want to wait until we resolve the problem with gears, possibly this will solve other rendering issues as well. ... By the way, I won't be very surprised if some old gcc release simply fails at handling bitfields which are used to store both the keys of shader variants in r600g and bytecode data in r600-sb (the same data that ends up being broken in your glxgears dump), IIRC there were bitfields-related bugs. It's not a bug, but undefined behavior AFAIK. I sent a patch that fixes the constant rebuilds ("r600g: Correctly initialize the shader key"). With these currently pending patches applied, I get no more visual distortion, and the fps improves a bit (28 -> 32). Just in case it'd be useful to you, here's the current sb,vs,ps output: http://bayfiles.net/file/PnH3/9BRcGY/foo_shaders.gz - Lauri ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600 Patchset: Emit true ISA
Hi, The attached patches modify the CodeEmitter to emit true ISA. Previously, we were prefixing all instructions with an instruction type byte. Vincent did most of the work to convert the CodeEmitter to true ISA, these patches are just the last few cleanups that are needed to finish the project. Please test/review. Thanks, Tom From 4fc6af0637de0eae0542a987e93d467bad3a4eee Mon Sep 17 00:00:00 2001 From: Tom Stellard thomas.stell...@amd.com Date: Fri, 3 May 2013 11:17:18 -0700 Subject: [PATCH 1/4] R600: Emit ISA for CALL_FS_* instructions --- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 1 - test/CodeGen/R600/call_fs.ll | 15 +++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/R600/call_fs.ll diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 7c83d86..8261477 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -262,7 +262,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, } case AMDGPU::CF_CALL_FS_EG: case AMDGPU::CF_CALL_FS_R600: - return; case AMDGPU::CF_TC_EG: case AMDGPU::CF_VC_EG: case AMDGPU::CF_TC_R600: diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll new file mode 100644 index 000..fd21b72 --- /dev/null +++ b/test/CodeGen/R600/call_fs.ll @@ -0,0 +1,15 @@ + +; RUN: llc %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG-CHECK %s +; RUN: llc %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600-CHECK %s + +; EG-CHECK: @call_fs +; EG-CHECK: CALL_FS ; encoding: [0x03,0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84] +; R600-CHECK: @call_fs +; R600-CHECK:CALL_FS ; encoding: [0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] + + +define void @call_fs() #0 { + ret void +} + +attributes #0 = { ShaderType=1 } ; Vertex Shader -- 1.7.11.4 From 24b983199b5c81eb0362f4c583eafb842255acc5 Mon Sep 17 
00:00:00 2001 From: Tom Stellard thomas.stell...@amd.com Date: Fri, 3 May 2013 11:38:33 -0700 Subject: [PATCH 2/4] R600: Stop emitting the instruction type byte before each instruction --- lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 35 ++ test/CodeGen/R600/call_fs.ll | 4 +-- test/CodeGen/R600/cf_end.ll| 4 +-- 3 files changed, 6 insertions(+), 37 deletions(-) diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 8261477..10dee20 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -9,12 +9,8 @@ // /// \file /// -/// This code emitter outputs bytecode that is understood by the r600g driver -/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA, -/// but it still needs to be run through a finalizer in order to be executed -/// by the GPU. -/// -/// [1] http://www.mesa3d.org/ +/// \brief The R600 code emitter produces machine code that can be executed +/// directly on the GPU device. 
// //===--===// @@ -95,16 +91,6 @@ enum RegElement { ELEMENT_W }; -enum InstrTypes { - INSTR_ALU = 0, - INSTR_TEX, - INSTR_FC, - INSTR_NATIVE, - INSTR_VTX, - INSTR_EXPORT, - INSTR_CFALU -}; - enum FCInstr { FC_IF_PREDICATE = 0, FC_ELSE, @@ -152,7 +138,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_NATIVE, OS); Emit(inst, OS); break; } @@ -170,9 +155,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset InstWord2 |= 1 19; - EmitByte(INSTR_NATIVE, OS); Emit(InstWord01, OS); - EmitByte(INSTR_NATIVE, OS); Emit(InstWord2, OS); Emit((u_int32_t) 0, OS); break; @@ -246,9 +229,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, SrcSelect[ELEMENT_W] 29 | Offsets[0] 0 | Offsets[1] 5 | Offsets[2] 10; - EmitByte(INSTR_NATIVE, OS); Emit(Word01, OS); - EmitByte(INSTR_NATIVE, OS); Emit(Word2, OS); Emit((u_int32_t) 0, OS); break; @@ -256,7 +237,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, case AMDGPU::CF_ALU: case AMDGPU::CF_ALU_PUSH_BEFORE: { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_NATIVE, OS); Emit(Inst, OS); break; } @@ -289,13 +269,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst
Re: [Mesa-dev] r600 sb test results
On Thu, 02 May 2013 00:45:13 +0400 Vadim Girlin vadimgir...@gmail.com wrote: On 05/01/2013 11:36 PM, Lauri Kasanen wrote: Now that it built, I could test your optimizations in my own apps. These are on current master 8eef6ad, on a RV710 (HD 4350 pci-e). In one of my private apps, using R600_DEBUG=sb caused regressions: FPS went from 28 to 7, the SSAO shader gave visual distortions/flicker, and the cpu was constantly pegged. Here's the output from R600_DEBUG=sb,sbstat in case it helps: http://bayfiles.net/file/Pmkh/PUj0Ru/vadim.gz It seems as if it's constantly handling new shaders? My app certainly issues no new shaders, they are all linked when the app starts. r600g may rebuild shaders at runtime because some GL features are implemented in shader code, so if your app changes some specific GL states (e.g. two-sided rendering mode), then r600g has to build and switch between different shader variants. It mainly uses the stencil buffer, the clear color is changed in various passes, some occlusion queries with color masks, but nothing exotic. New uniforms are of course sent each frame. On the other hand there is caching of shader variants in r600g implemented specially to prevent repetitive rebuilding of shaders, looks like it doesn't work in your case for some reason. Optimizations take more time than rebuilding with default backend, that explains performance regression. Could you provide some test app that reproduces these issues? It's quite time-taking to cut it down, and apitraces of it in full are several gigs (far too much to upload with my connection). I'll see if I can get just the SSAO isolated, with minimal textures, to get a smaller trace. Please also send me the dump with R600_DEBUG=sb,ps,vs, maybe I'll be able to spot anything wrong there. http://bayfiles.net/file/PmY5/xgIdlZ/foo.gz Let me know what you need to debug this. - Lauri PS: I'm not sure if this should be public or not, I think you're the only one working on it? 
Yes, I doubt that anyone else will work on it, on the other hand I think reporting this on the list might help other users who will possibly hit similar issues. Also at least in this case it looks rather like a problem in r600g, so I'm cc'ing mesa-dev, r600-sb just made this issue more noticeable because shader rebuilding with optimization requires more time. Using standard r600g, the cpu usage is less than 25% of one core, so nothing was showing it was constantly rebuilding shaders. Is there some way I could've found it was doing that, and if so, why? - Lauri ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 sb test results
On 05/02/2013 06:34 PM, Lauri Kasanen wrote: On Thu, 02 May 2013 00:45:13 +0400 Vadim Girlin vadimgir...@gmail.com wrote: On 05/01/2013 11:36 PM, Lauri Kasanen wrote: Now that it built, I could test your optimizations in my own apps. These are on current master 8eef6ad, on a RV710 (HD 4350 pci-e). In one of my private apps, using R600_DEBUG=sb caused regressions: FPS went from 28 to 7, the SSAO shader gave visual distortions/flicker, and the cpu was constantly pegged. Here's the output from R600_DEBUG=sb,sbstat in case it helps: http://bayfiles.net/file/Pmkh/PUj0Ru/vadim.gz It seems as if it's constantly handling new shaders? My app certainly issues no new shaders, they are all linked when the app starts. r600g may rebuild shaders at runtime because some GL features are implemented in shader code, so if your app changes some specific GL states (e.g. two-sided rendering mode), then r600g has to build and switch between different shader variants. It mainly uses the stencil buffer, the clear color is changed in various passes, some occlusion queries with color masks, but nothing exotic. New uniforms are of course sent each frame. On the other hand there is caching of shader variants in r600g implemented specially to prevent repetitive rebuilding of shaders, looks like it doesn't work in your case for some reason. Optimizations take more time than rebuilding with default backend, that explains performance regression. Could you provide some test app that reproduces these issues? It's quite time-taking to cut it down, and apitraces of it in full are several gigs (far too much to upload with my connection). I'll see if I can get just the SSAO isolated, with minimal textures, to get a smaller trace. I'm almost sure that the same issue that you have with glxgears affects your app too, so you might want to wait until we resolve the problem with gears, possibly this will solve other rendering issues as well. 
Please also send me the dump with R600_DEBUG=sb,ps,vs, maybe I'll be able to spot anything wrong there. http://bayfiles.net/file/PmY5/xgIdlZ/foo.gz Let me know what you need to debug this. - Lauri PS: I'm not sure if this should be public or not, I think you're the only one working on it? Yes, I doubt that anyone else will work on it, on the other hand I think reporting this on the list might help other users who will possibly hit similar issues. Also at least in this case it looks rather like a problem in r600g, so I'm cc'ing mesa-dev, r600-sb just made this issue more noticeable because shader rebuilding with optimization requires more time. Using standard r600g, the cpu usage is less than 25% of one core, so nothing was showing it was constantly rebuilding shaders. Is there some way I could've found it was doing that, and if so, why? You could run the app with R600_DEBUG=ps,vs (without sb) - it will also print the dump of every built shader. r600-sb doesn't affect the logic of shader rebuilding, it just processes the shaders when asked by r600g, so I think you'll see the same - a lot of built shaders. You could even try this with older mesa (before r600-sb was merged) to be sure. As for the cause of rebuilding, I don't see any changes in the shaders in your dump that might be explained by state changes, it's exactly the same shaders rebuilt more than once, so far I don't know why. You might want to look into r600_shader_select function with debugger to see what's going wrong, it computes the key for required shader variant using r600_shader_selector_key, then looks at the list of variants to find already built shader with the same key, and builds a new one only if it can't find existing shader. Looks like something fails there. 
By the way, I won't be very surprised if some old gcc release simply fails at handling bitfields which are used to store both the keys of shader variants in r600g and bytecode data in r600-sb (the same data that ends up being broken in your glxgears dump), IIRC there were bitfields-related bugs. Vadim - Lauri ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600/sb binary constants
Hi list The recently added r600 sb backend fails to build on GCC 4.3, since it uses binary constants (0b0101). Is the GCC version dependency intentional, or should the constants be changed to int/hex? - Lauri ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] r600-sb: glxgears wrong rendering
Hi Running R600_DEBUG=sb glxgears on a RV710 gives wrong output: http://i40.tinypic.com/t7gx09.png This is on current master, git-8eef6ad. Let me know what you need to debug this. - Lauri ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600-sb: glxgears wrong rendering
On 05/01/2013 11:42 PM, Lauri Kasanen wrote: Hi Running R600_DEBUG=sb glxgears on a RV710 gives wrong output: http://i40.tinypic.com/t7gx09.png This is on current master, git-8eef6ad. Let me know what you need to debug this. Please send me the output with R600_DEBUG=sb,ps,vs Vadim ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] r600 sb test results
On 05/01/2013 11:36 PM, Lauri Kasanen wrote: Hi Vadim Now that it built, I could test your optimizations in my own apps. These are on current master 8eef6ad, on a RV710 (HD 4350 pci-e). In one of my private apps, using R600_DEBUG=sb caused regressions: FPS went from 28 to 7, the SSAO shader gave visual distortions/flicker, and the cpu was constantly pegged. Here's the output from R600_DEBUG=sb,sbstat in case it helps: http://bayfiles.net/file/Pmkh/PUj0Ru/vadim.gz It seems as if it's constantly handling new shaders? My app certainly issues no new shaders, they are all linked when the app starts. Hi, r600g may rebuild shaders at runtime because some GL features are implemented in shader code, so if your app changes some specific GL states (e.g. two-sided rendering mode), then r600g has to build and switch between different shader variants. On the other hand there is caching of shader variants in r600g implemented specially to prevent repetitive rebuilding of shaders, looks like it doesn't work in your case for some reason. Optimizations take more time than rebuilding with default backend, that explains performance regression. Could you provide some test app that reproduces these issues? Please also send me the dump with R600_DEBUG=sb,ps,vs, maybe I'll be able to spot anything wrong there. Let me know what you need to debug this. - Lauri PS: I'm not sure if this should be public or not, I think you're the only one working on it? Yes, I doubt that anyone else will work on it, on the other hand I think reporting this on the list might help other users who will possibly hit similar issues. Also at least in this case it looks rather like a problem in r600g, so I'm cc'ing mesa-dev, r600-sb just made this issue more noticeable because shader rebuilding with optimization requires more time. Vadim ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev
[Mesa-dev] R600 Patchset: Optimizations for bfgminer
Hi, The attached patchset implements a few optimizations for the bfgminer bitcoin mining program. Please Review. -Tom From 661e832408a8bafc03a7c4c600c4a140b03054b4 Mon Sep 17 00:00:00 2001 From: Dmitry Cherkassov dcherkas...@gmail.com Date: Thu, 7 Mar 2013 20:17:59 +0400 Subject: [PATCH 1/3] R600: Add 64-bit load/store support * Added R600_Reg64 class * Added T#Index#.XY registers definition * Added v2i32 register reads from parameter and global space * Added f32 and i32 elements extraction from v2f32 and v2i32 * Added v2i32 - v2f32 conversions Signed-off-by: Dmitry Cherkassov dcherkas...@gmail.com Tom Stellard: - Mark vec2 operations as expand. The addition of a vec2 register class made them all legal. --- lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++ lib/Target/R600/AMDILISelDAGToDAG.cpp | 10 - lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 3 ++ lib/Target/R600/R600ISelLowering.cpp | 17 + lib/Target/R600/R600InstrInfo.cpp | 19 ++ lib/Target/R600/R600Instructions.td| 44 ++ lib/Target/R600/R600RegisterInfo.td| 16 test/CodeGen/R600/64bit-kernel-args.ll | 41 test/CodeGen/R600/fadd.ll | 10 + test/CodeGen/R600/fdiv.ll | 37 +- test/CodeGen/R600/fmul.ll | 10 + test/CodeGen/R600/fp_to_sint.ll| 10 + test/CodeGen/R600/fp_to_uint.ll| 10 + test/CodeGen/R600/fsub.ll | 20 +++--- test/CodeGen/R600/setcc.ll | 18 +++-- test/CodeGen/R600/sint_to_fp.ll| 10 + test/CodeGen/R600/udiv.ll | 20 +++--- test/CodeGen/R600/uint_to_fp.ll| 10 + test/CodeGen/R600/urem.ll | 21 --- 19 files changed, 292 insertions(+), 40 deletions(-) create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index a266df5..4a064b1 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::STORE, MVT::f32, Promote); AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + 
setOperationAction(ISD::STORE, MVT::v2f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f32, Promote); AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); @@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) : setOperationAction(ISD::LOAD, MVT::v4f32, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::UDIV, MVT::i32, Expand); diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index ba75a44..198cd7e 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -167,12 +167,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (ST.device()-getGeneration() AMDGPUDeviceInfo::HD6XXX) { break; } +unsigned RegSequenceClassID; +EVT VT = N-getValueType(0); +assert(VT.isVector()); +switch (VT.getVectorNumElements()) { +case 4: RegSequenceClassID = AMDGPU::R600_Reg128RegClassID; break; +case 2: RegSequenceClassID = AMDGPU::R600_Reg64RegClassID; break; +default: llvm_unreachable(Unhandled vector width in BUILD_VECTOR); +} // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG // that adds a 128 bits reg copy when going through TwoAddressInstructions // pass. We want to avoid 128 bits copies as much as possible because they // can't be bundled by our scheduler. 
SDValue RegSeqArgs[9] = { - CurDAG-getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32), + CurDAG-getTargetConstant(RegSequenceClassID, MVT::i32), SDValue(), CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32), SDValue(), CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32), SDValue(), CurDAG-getTargetConstant(AMDGPU::sub2, MVT::i32), diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 7c83d86..030fc87 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -150,6 +150,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS, } else { switch(MI.getOpcode()) { case AMDGPU::RAT_WRITE_CACHELESS_32_eg: +