Re: [Mesa-dev] [r600/sfn] Compilation error (and some warnings) - maybe to old LLVM git version, here?

2020-11-28 Thread Gert Wollny
Am Samstag, den 28.11.2020, 02:46 +0100 schrieb Dieter Nützel:
> [48/179] Compiling C++ object 
> src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o
> FAILED: 
> src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o
> ccache c++ -Isrc/gallium/drivers/r600/libr600.a.p 
> -Isrc/gallium/drivers/r600 -I../src/gallium/drivers/r600 -Isrc
> -I../src 
> -Isrc/mapi -I../src/mapi -Isrc/mesa -I../src/mesa -Iinclude
> -I../include 
> -Isrc/compiler -I../src/compiler -I../src/gallium/include 
> -Isrc/gallium/auxiliary -I../src/gallium/auxiliary -Isrc/amd/common 
> -I../src/amd/common -Isrc/gallium/drivers -I../src/gallium/drivers 
> -Isrc/compiler/nir -I../src/compiler/nir -Isrc/util -I../src/util 
> -I/usr/include/libdrm -fvisibility=hidden -fdiagnostics-color=always 
> -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch 
> -Wnon-virtual-dtor -std=c++14 -O3 -ffunction-sections -fdata-
> sections 
> '-DPACKAGE_VERSION="21.0.0-devel"' 
> '-DPACKAGE_BUGREPORT="
> https://gitlab.freedesktop.org/mesa/mesa/-/issues;' 
> -DUSE_ELF_TLS -DHAVE_ST_VDPAU -DENABLE_ST_OMX_BELLAGIO=0 
> -DENABLE_ST_OMX_TIZONIA=0 -DHAVE_X11_PLATFORM -DHAVE_XCB_PLATFORM 
> -DGLX_INDIRECT_RENDERING -DGLX_DIRECT_RENDERING -DGLX_USE_DRM 
> -DHAVE_DRM_PLATFORM -DENABLE_SHADER_CACHE -DHAVE___BUILTIN_BSWAP32 
> -DHAVE___BUILTIN_BSWAP64 -DHAVE___BUILTIN_CLZ -DHAVE___BUILTIN_CLZLL 
> -DHAVE___BUILTIN_CTZ -DHAVE___BUILTIN_EXPECT -DHAVE___BUILTIN_FFS 
> -DHAVE___BUILTIN_FFSLL -DHAVE___BUILTIN_POPCOUNT 
> -DHAVE___BUILTIN_POPCOUNTLL -DHAVE___BUILTIN_UNREACHABLE 
> -DHAVE_FUNC_ATTRIBUTE_CONST -DHAVE_FUNC_ATTRIBUTE_FLATTEN 
> -DHAVE_FUNC_ATTRIBUTE_MALLOC -DHAVE_FUNC_ATTRIBUTE_PURE 
> -DHAVE_FUNC_ATTRIBUTE_UNUSED
> -DHAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT 
> -DHAVE_FUNC_ATTRIBUTE_WEAK -DHAVE_FUNC_ATTRIBUTE_FORMAT 
> -DHAVE_FUNC_ATTRIBUTE_PACKED -DHAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL 
> -DHAVE_FUNC_ATTRIBUTE_ALIAS -DHAVE_FUNC_ATTRIBUTE_NORETURN 
> -DHAVE_FUNC_ATTRIBUTE_VISIBILITY -DHAVE_UINT128 -DUSE_SSE41 
> -DUSE_GCC_ATOMIC_BUILTINS -DUSE_X86_64_ASM -DMAJOR_IN_SYSMACROS 
> -DHAVE_LINUX_FUTEX_H -DHAVE_ENDIAN_H -DHAVE_DLFCN_H
> -DHAVE_EXECINFO_H 
> -DHAVE_SYS_SHM_H -DHAVE_CET_H -DHAVE_STRTOF -DHAVE_MKOSTEMP 
> -DHAVE_TIMESPEC_GET -DHAVE_MEMFD_CREATE -DHAVE_RANDOM_R -DHAVE_FLOCK 
> -DHAVE_STRTOK_R -DHAVE_GETRANDOM -DHAVE_PROGRAM_INVOCATION_NAME 
> -DHAVE_POSIX_MEMALIGN -DHAVE_DIRENT_D_TYPE -DHAVE_STRTOD_L
> -DHAVE_DLADDR 
> -DHAVE_DL_ITERATE_PHDR -DHAVE_ZLIB -DHAVE_ZSTD -DHAVE_PTHREAD 
> -DHAVE_PTHREAD_SETAFFINITY -DHAVE_LIBDRM -DLLVM_AVAILABLE 
> '-DMESA_LLVM_VERSION_STRING="12.0.0"' -DLLVM_IS_SHARED=1 
> -DUSE_LIBGLVND=1 -DHAVE_LIBUNWIND -DHAVE_DRI3 -DHAVE_DRI3_MODIFIERS 
> -DHAVE_LIBSENSORS=1 -Werror=return-type -Werror=empty-body 
> -Wno-non-virtual-dtor -Wno-missing-field-initializers 
> -Wno-format-truncation -fno-math-errno -fno-trapping-math 
> -flifetime-dse=1 -Werror=format -Wformat-security -fPIC -pthread 
> -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -D__STDC_LIMIT_MACROS 
> -D__STDC_CONSTANT_MACROS -MD -MQ 
> src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o
> -MF 
> src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o.d
> -o 
> src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o
> -c 
> ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp
> ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp: In function 
> ‘unsigned int r600::barycentric_ij_index(nir_intrinsic_instr*)’:
> ../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp:102:4:
> error: 
> control reaches end of non-void function [-Werror=return-type]
>102 |case INTERP_MODE_FLAT:
>|^~~~
There is an "assert" there that should be an "unreachable", patch
coming up.  

Thanks for testing. 
Gert 


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [r600/sfn] Compilation error (and some warnings) - maybe to old LLVM git version, here?

2020-11-27 Thread Dieter Nützel
[48/179] Compiling C++ object 
src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o
FAILED: 
src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o
ccache c++ -Isrc/gallium/drivers/r600/libr600.a.p 
-Isrc/gallium/drivers/r600 -I../src/gallium/drivers/r600 -Isrc -I../src 
-Isrc/mapi -I../src/mapi -Isrc/mesa -I../src/mesa -Iinclude -I../include 
-Isrc/compiler -I../src/compiler -I../src/gallium/include 
-Isrc/gallium/auxiliary -I../src/gallium/auxiliary -Isrc/amd/common 
-I../src/amd/common -Isrc/gallium/drivers -I../src/gallium/drivers 
-Isrc/compiler/nir -I../src/compiler/nir -Isrc/util -I../src/util 
-I/usr/include/libdrm -fvisibility=hidden -fdiagnostics-color=always 
-DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch 
-Wnon-virtual-dtor -std=c++14 -O3 -ffunction-sections -fdata-sections 
'-DPACKAGE_VERSION="21.0.0-devel"' 
'-DPACKAGE_BUGREPORT="https://gitlab.freedesktop.org/mesa/mesa/-/issues;' 
-DUSE_ELF_TLS -DHAVE_ST_VDPAU -DENABLE_ST_OMX_BELLAGIO=0 
-DENABLE_ST_OMX_TIZONIA=0 -DHAVE_X11_PLATFORM -DHAVE_XCB_PLATFORM 
-DGLX_INDIRECT_RENDERING -DGLX_DIRECT_RENDERING -DGLX_USE_DRM 
-DHAVE_DRM_PLATFORM -DENABLE_SHADER_CACHE -DHAVE___BUILTIN_BSWAP32 
-DHAVE___BUILTIN_BSWAP64 -DHAVE___BUILTIN_CLZ -DHAVE___BUILTIN_CLZLL 
-DHAVE___BUILTIN_CTZ -DHAVE___BUILTIN_EXPECT -DHAVE___BUILTIN_FFS 
-DHAVE___BUILTIN_FFSLL -DHAVE___BUILTIN_POPCOUNT 
-DHAVE___BUILTIN_POPCOUNTLL -DHAVE___BUILTIN_UNREACHABLE 
-DHAVE_FUNC_ATTRIBUTE_CONST -DHAVE_FUNC_ATTRIBUTE_FLATTEN 
-DHAVE_FUNC_ATTRIBUTE_MALLOC -DHAVE_FUNC_ATTRIBUTE_PURE 
-DHAVE_FUNC_ATTRIBUTE_UNUSED -DHAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT 
-DHAVE_FUNC_ATTRIBUTE_WEAK -DHAVE_FUNC_ATTRIBUTE_FORMAT 
-DHAVE_FUNC_ATTRIBUTE_PACKED -DHAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL 
-DHAVE_FUNC_ATTRIBUTE_ALIAS -DHAVE_FUNC_ATTRIBUTE_NORETURN 
-DHAVE_FUNC_ATTRIBUTE_VISIBILITY -DHAVE_UINT128 -DUSE_SSE41 
-DUSE_GCC_ATOMIC_BUILTINS -DUSE_X86_64_ASM -DMAJOR_IN_SYSMACROS 
-DHAVE_LINUX_FUTEX_H -DHAVE_ENDIAN_H -DHAVE_DLFCN_H -DHAVE_EXECINFO_H 
-DHAVE_SYS_SHM_H -DHAVE_CET_H -DHAVE_STRTOF -DHAVE_MKOSTEMP 
-DHAVE_TIMESPEC_GET -DHAVE_MEMFD_CREATE -DHAVE_RANDOM_R -DHAVE_FLOCK 
-DHAVE_STRTOK_R -DHAVE_GETRANDOM -DHAVE_PROGRAM_INVOCATION_NAME 
-DHAVE_POSIX_MEMALIGN -DHAVE_DIRENT_D_TYPE -DHAVE_STRTOD_L -DHAVE_DLADDR 
-DHAVE_DL_ITERATE_PHDR -DHAVE_ZLIB -DHAVE_ZSTD -DHAVE_PTHREAD 
-DHAVE_PTHREAD_SETAFFINITY -DHAVE_LIBDRM -DLLVM_AVAILABLE 
'-DMESA_LLVM_VERSION_STRING="12.0.0"' -DLLVM_IS_SHARED=1 
-DUSE_LIBGLVND=1 -DHAVE_LIBUNWIND -DHAVE_DRI3 -DHAVE_DRI3_MODIFIERS 
-DHAVE_LIBSENSORS=1 -Werror=return-type -Werror=empty-body 
-Wno-non-virtual-dtor -Wno-missing-field-initializers 
-Wno-format-truncation -fno-math-errno -fno-trapping-math 
-flifetime-dse=1 -Werror=format -Wformat-security -fPIC -pthread 
-D__STDC_FORMAT_MACROS -D_GNU_SOURCE -D__STDC_LIMIT_MACROS 
-D__STDC_CONSTANT_MACROS -MD -MQ 
src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o -MF 
src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o.d -o 
src/gallium/drivers/r600/libr600.a.p/sfn_sfn_shader_fragment.cpp.o -c 
../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp
../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp: In function 
‘unsigned int r600::barycentric_ij_index(nir_intrinsic_instr*)’:
../src/gallium/drivers/r600/sfn/sfn_shader_fragment.cpp:102:4: error: 
control reaches end of non-void function [-Werror=return-type]

  102 |case INTERP_MODE_FLAT:
  |^~~~
cc1plus: some warnings being treated as errors
[56/179] Compiling C++ object 
src/gallium/frontends/clover/libclllvm.a.p/llvm_codegen_common.cpp.o
In file included from 
../src/gallium/frontends/clover/llvm/codegen/common.cpp:34:
../src/gallium/frontends/clover/llvm/metadata.hpp: In function 
‘std::string clover::llvm::get_type_kernel_metadata(const 
llvm::Function&, const string&)’:
../src/gallium/frontends/clover/llvm/metadata.hpp:132:86: warning: 
‘unsigned int llvm::VectorType::getNumElements() const’ is deprecated 
[-Wdeprecated-declarations]
  132 |   data += 
std::to_string(((::llvm::VectorType*)type)->getNumElements());
  |  
^

In file included from /usr/local/include/llvm/IR/DataLayout.h:26,
 from /usr/local/include/llvm/IR/Module.h:25,
 from 
../src/gallium/frontends/clover/llvm/codegen.hpp:35,
 from 
../src/gallium/frontends/clover/llvm/codegen/common.cpp:33:

/usr/local/include/llvm/IR/DerivedTypes.h:534:10: note: declared here
  534 | unsigned VectorType::getNumElements() const {
  |  ^~
[57/179] Compiling C++ object 
src/gallium/frontends/clover/libclllvm.a.p/llvm_invocation.cpp.o
In file included from 
../src/gallium/frontends/clover/llvm/invocation.cpp:55:
../src/gallium/frontends/clover/llvm/metadata.hpp: In function 
‘std::string clover::llvm::get_type_kernel_metadata(const 
llvm::Function&, const string&)’:

Re: [Mesa-dev] r600

2019-11-27 Thread Gert Wollny
Am Donnerstag, den 28.11.2019, 13:22 +1000 schrieb Dave Airlie:
> On Wed, 27 Nov 2019 at 21:08, Gert Wollny 
> wrote:
> > 
> > Before that I'd like to un-tabbify the whole r600 driver code,
> > because all the other parts of mesa I've been touching use spaces,
> > and it makes it more convenient to have the same WS handling
> > everywhere.
> > 
> I'm not against it from a style point of view, but since it totally
> breaks git history, blame, cherry-picking and many other useful
> things, I'd really ask you to reconsider and just use editorconfig
> 
Fair enough, I'll keep the tabs for now and see whether I can get
editorconfig to work for me. 

Best, 
Gert 

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] r600

2019-11-27 Thread Dave Airlie
On Wed, 27 Nov 2019 at 21:08, Gert Wollny  wrote:
>
> Hello Dave,
>
> I was wondering how much interest you still have in R600? I'm preparing
> to start feeding my NIR work as MRs to continue my work in-tree. It is
> currently only for Evergreen and still far from feature parity
> with TGSI (no tessellation, no images, nor SSBOs), some things regress,
> but some things are also fixed, so obviously the backend will only be
> enabled on explicit request.
>
> Before that I'd like to un-tabbify the whole r600 driver code, because
> all the other parts of mesa I've been touching use spaces, and it makes
> it more convenient to have the same WS handling everywhere.
>
> Would this be okay with you?

I'm not against it from a style point of view, but since it totally
breaks git history, blame, cherry-picking and many other useful things,
I'd really ask you to reconsider and just use editorconfig. Maybe r600
is quiet enough now that we don't have to worry much about backports
or cherry-picks, so I'll leave it up to you.

At least leave sb alone since we probably want to kill that someday if
your NIR backend gets there, and I doubt you want to touch it too much
either.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] r600

2019-11-27 Thread Gert Wollny
Hello Dave, 

I was wondering how much interest you still have in R600? I'm preparing
to start feeding my NIR work as MRs to continue my work in-tree. It is
currently only for Evergreen and still far from feature parity
with TGSI (no tessellation, no images, nor SSBOs), some things regress,
but some things are also fixed, so obviously the backend will only be
enabled on explicit request. 

Before that I'd like to un-tabbify the whole r600 driver code, because
all the other parts of mesa I've been touching use spaces, and it makes
it more convenient to have the same WS handling everywhere.

Would this be okay with you?

Best, 
Gert




___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] r600/eg : ARB_query_buffer_object initial support.

2018-01-24 Thread Dave Airlie
This passes the arb_query_buffer-object-qbo test in piglit,
the coherent test is a bit less successful but some of that is
lacking support for indirect compute anyways.

I'm not going to enable GL4.5, as we haven't got CTS coverage
yet, but this is one of the last bits towards GL4.5 on cayman.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 sb tessellation support

2018-01-11 Thread Gert Wollny
Hi Dave, 

Am Mittwoch, den 10.01.2018, 16:48 +1000 schrieb Dave Airlie:
> This is an attempt to add tessellation support to the SB backend.
> 

I tried to dig a bit more in the failing piglits, specifically 
"1in-1out" that passed with your WIP branch from Jan/9. 

Now, with sb it fails by drawing some pseudo-random patterns that would
indicate that for some of the patches uninitialized memory is accessed
when drawing the upper left sub-quad of each patch. The pattern may
change when pressing some key.  
However, by inspecting disassembly I was not able to detect any change
in the LDS addresses accessed by the optimized shaders vs. the original
ones, only the access order is (mostly) reversed (which is probably the
most notable change w.r.t. the WIP branch). 

When I change the tess-factors to 2.0 (instead of 3.0) and also the
related values in the vertex and tes shader, the test passes, which
should confirm that the LDS access is correct. 

If I change the 3 only in tes to a positive value below 2.5, then the
drawn pattern doesn't show random behaviour. Also, if in the assignment
to the input[].z value in the vertex shader the 64 is replaced by 64.0,
making the evaluation there a floating point operation, then the random
patterns in the piglit screen output disappear. Since this z-value is
not used for the coordinate evaluation it should have no influence on
these patterns.

Best, 
Gert 


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 sb tessellation support

2018-01-10 Thread Gert Wollny
Am Mittwoch, den 10.01.2018, 16:48 +1000 schrieb Dave Airlie:
> This is an attempt to add tessellation support to the SB backend.
> 
> The main things needed are GDS access which is used for tess
> factor storage (also used for atomic counters), and LDS access
> which is needed to pass all the data between stages.
> 
> The first 19 patches are the stuff I'm happy with, the
> nop/sanity shader tests pass with those (and sb enabled).
> 
> The last two patches make heaven work and turn on sb,
> I'm not suggesting these be applied as-is yet.

Compared to yesterday's 42 regressions nosb versus sb it's now down to
five regressions within -t tessellation:

execution 
   1in-1out 

   tes-input/tes-input-gl_clipdistance
   tes-input/tes-input-patch-mat2x4_2
   tes-input/tes-input-patch-mat3x4_2
   tes-input/tes-input-patch-mat4

and one failure became a crash in 

trivial-tess-gs_no-gs-inputs

but the crash can easily be alleviated with a patch I'll send out later.
(I tried to send it yesterday, but noticed today that something had gone
wrong, i.e. I sent an empty email instead.) 

Some numbers on 6870HD:
Heaven 1280x1024 
Quality: High, Tesselation: Normal, Anti-Aliasing: Off
FPS: 18.5 [5.7, 70] (was: 12.6 [4.0, 61] before any optimization). 

Tessmark x32, 1024x640 
FPS: 45, 2153 points (was 10, 635)

With all this: Tested-By: Gert Wollny  

> I think in theory enabling sb for atomics/images/compute should
> be fine after this series as well, but I haven't tested that too
> much.
I'll check this out. 

Many thanks for your work on this, 
Gert

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600 sb tessellation support

2018-01-09 Thread Dave Airlie
This is an attempt to add tessellation support to the SB backend.

The main things needed are GDS access which is used for tess
factor storage (also used for atomic counters), and LDS access
which is needed to pass all the data between stages.

The first 19 patches are the stuff I'm happy with, the
nop/sanity shader tests pass with those (and sb enabled).

The last two patches make heaven work and turn on sb,
I'm not suggesting these be applied as-is yet.

I think in theory enabling sb for atomics/images/compute should
be fine after this series as well, but I haven't tested that too
much.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600 ssbo/image fixes

2017-12-05 Thread Dave Airlie
I've been running deqp-gles31 over the r600 ssbo/image code
it uses compute shaders, but I've found a few bugs in the in-tree
code, so just sending some fixes out for those first.

ssbo seems to pass all the tests, images have some heisenbug
where they pass sometimes and not others.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600: cayman atomic gds support

2017-11-30 Thread Dave Airlie
There appears to be some bad interaction with the append/consume counters
on cayman (and compute shaders at least). I traced fglrx and it appears
it directly uses GDS memory.

This adds cayman specific paths to directly use GDS memory for these
atomics.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 evergreen+ shader image support

2017-11-15 Thread Gert Wollny
Am Mittwoch, den 15.11.2017, 10:11 +1000 schrieb Dave Airlie:
> 
> It's not 100% on piglits, but it's quite close, and better than fglrx
> does, so I'd probably prefer to land it before doing too much more
> destructive hacking on it!

I ran the piglits shader set on barts - no regressions, and all the
newly tested piglits pass, i.e. 

   basic-imagestore-const-uniform-index
   basic-imagestore-mixed-const-non-const-uniform-index
   basic-imagestore-mixed-const-non-const-uniform-index2
   basic-imagestore-non-const-uniform-index
   arb_shader_image_load_store
   basic-imagestore-from-uniform
   disable_early_z
   image_checkerboard
   load-from-cleared-image
   write-to-rendered-image
   arb_shading_language_420pack
   different-bindings-image2d
  

For the series: 

  Tested-By: Gert Wollny  

Best, 
Gert 

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600 evergreen+ shader image support

2017-11-14 Thread Dave Airlie
I've been hacking on this on/off for quite a while now, and I think
I'm finally happy with where it has reached.

It's not 100% on piglits, but it's quite close, and better than fglrx
does, so I'd probably prefer to land it before doing too much more
destructive hacking on it!

If you have a cayman, you now get GL4.2.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600: some misc assembler and state updates

2017-11-02 Thread Nicolai Hähnle

For the series:

Reviewed-by: Nicolai Hähnle 

On 01.11.2017 00:32, Dave Airlie wrote:

These are just some misc patches from the road to GL4.3.
They don't do anything on their own, just cleanly improve the assembler
and some state setting.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev




--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600: some misc assembler and state updates

2017-10-31 Thread Dave Airlie
These are just some misc patches from the road to GL4.3.
They don't do anything on their own, just cleanly improve the assembler
and some state setting.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/AMDGPU fixes for Clover

2017-06-17 Thread Jan Vesely
On Fri, 2017-06-16 at 12:48 +0100, Emil Velikov wrote:
> On 15 June 2017 at 14:03, Aaron Watry  wrote:
> > Hey all,
> > 
> > We haven't landed the fixes to break the r600g dependency on AMDGPU yet.
> > I'm headed out of town for a long weekend and don't feel like risking the
> > push before being gone for five days.
> > 
> > I've got a v3 of Emil's patch 4/5 that removes the AMDGPU header dependency
> > from r600 and I'm good with the status of Jan's 3-patch series.  I'm hoping
> > we can square that away early next week unless it gets resolved while I'm
> > gone.
> > 
> 
> I've double-checked and Jan's 1-3 (squashed 2+3) alongside my 4-5
> resolve all the issues I could notice.
> Pushed the lot and I'll parse through patchwork in a moment.

thanks for keeping tabs on this. I've been mostly sick/travelling past
2 weeks. I'll be fully online starting Wednesday.

The third patch of my series was separate, because it needed a bit more
baking (I posted it to shift the discussion away from deletion), but I
guess the few follow up bugs hit all the rough edges.

libelf is only needed for with-opencl configurations (so android should
be OK without this dep. see my response to Mauro). I'll post a patch
that fixes automake this week.

d5199c (Revert "amd/common: add missing libdrm include path") looks
incorrect. libamd_common still includes amdgpu.h so I'd say it needs
the CFLAGS. Moving the include to ac_gpu_info.c (in 81945) is enough to
remove libdrm_amdgpu dependency from r600g builds.

thanks,
Jan
> 
> Thanks
> Emil

-- 
Jan Vesely 

signature.asc
Description: This is a digitally signed message part
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/AMDGPU fixes for Clover

2017-06-16 Thread Emil Velikov
On 15 June 2017 at 14:03, Aaron Watry  wrote:
> Hey all,
>
> We haven't landed the fixes to break the r600g dependency on AMDGPU yet.
> I'm headed out of town for a long weekend and don't feel like risking the
> push before being gone for five days.
>
> I've got a v3 of Emil's patch 4/5 that removes the AMDGPU header dependency
> from r600 and I'm good with the status of Jan's 3-patch series.  I'm hoping
> we can square that away early next week unless it gets resolved while I'm
> gone.
>
I've double-checked and Jan's 1-3 (squashed 2+3) alongside my 4-5
resolve all the issues I could notice.
Pushed the lot and I'll parse through patchwork in a moment.

Thanks
Emil
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600/AMDGPU fixes for Clover

2017-06-15 Thread Aaron Watry
Hey all,

We haven't landed the fixes to break the r600g dependency on AMDGPU yet.
I'm headed out of town for a long weekend and don't feel like risking the
push before being gone for five days.

I've got a v3 of Emil's patch 4/5 that removes the AMDGPU header dependency
from r600 and I'm good with the status of Jan's 3-patch series.  I'm hoping
we can square that away early next week unless it gets resolved while I'm
gone.

--Aaron
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600: some prelim fixes/patches for atomics

2017-06-07 Thread Dave Airlie
These are just some minor prelim patches for the GL4.3 work, that 
looked easy to split out.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600: Regarding "Failed to build shader (translation from TGSI) #99349

2017-05-29 Thread Gert Wollny
Hello all, 


Hardware; Radeon 6850HD, 
Mesa: mesa 17.0.1 and git (sha 531887), 
llvm:
4.0.0

Playing a bit around with the Unreal Editor I was confronted with the
same error message reported in #99349, i.e. "Failed to build shader
(translation from TGSI)". 

After some digging through the code I found that the TGSI code [1] of
the offending shader reserves 151 temporaries so that the available 128
GPRs are already allocated right from the start, and when the operation
"MUL TEMP[11], CONST[26], CONST[23]" is translated to byte code,
both constants are read from the cfile region, because 
tgsi_split_constant could not move one constant to a proper GPR.

As one can see in the TGSI dump [1], the shader does not really use 151
temporaries; only 40 are actually also addressed as source, while all the
other temps are just written once (assuming that the TGSI
notation is OP DEST, SRC0, SRC1 ...). 

My questions are now: 

Does the GLSL-TGSI stage of the compilation do any optimizations?
Specifically, should the unused temporaries be eliminated in that step
and that I get this TGSI-dump is actually a bug in this compilation
stage? (In the Gallium3D wikipedia article [2] it is written that there
is a TGSI optimization stage.) 

As far as I understand there is an optimization pass done after the TGSI
translation, but because of the nature of the problem the shader is
rejected before. Would it make sense to implement a patch that would
work around this problem by reserving some GPRs to move constants to
(and the temporary that is now ctx.temp_reg), and then test the number
of allocated registers only after the byte code optimization?  I
partially implemented something like this [3] when I tried to find the
source of the bug, so I could clean that up and propose a patch, so far
the graphical output is clobbered though. 


Many thanks, 
Gert 





[1]  https://bugs.freedesktop.org/attachment.cgi?id=131567 
(12kb, xz compressed)

[2] https://en.wikipedia.org/wiki/Gallium3D#Tungsten_Graphics_Shader_In
frastructure

[3] https://github.com/gerddie/mesa
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/compute: cleanup evergreen_compute.c

2016-04-06 Thread Tom Stellard
On Wed, Apr 06, 2016 at 10:40:50PM +0100, Dave Airlie wrote:
> This probably should have been cleaned up before merging, but we
> were a bit lax with it. This is a bunch of cleanups and changes,
> that make adding ARB_compute_support less of a task.
> 

Acked-by: Tom Stellard 

> Dave.
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/compute: cleanup evergreen_compute.c

2016-04-06 Thread eocallaghan

Nice cleanup. This series is,

Reviewed-by: Edward O'Callaghan 


On 2016-04-07 07:40, Dave Airlie wrote:

This probably should have been cleaned up before merging, but we
were a bit lax with it. This is a bunch of cleanups and changes,
that make adding ARB_compute_support less of a task.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600/compute: cleanup evergreen_compute.c

2016-04-06 Thread Dave Airlie
This probably should have been cleaned up before merging, but we
were a bit lax with it. This is a bunch of cleanups and changes,
that make adding ARB_compute_support less of a task.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 tess branches updated

2015-12-06 Thread Grazvydas Ignotas
Hi,

On Fri, Dec 4, 2015 at 6:19 AM, Dave Airlie  wrote:
> Hey all,
>
> I've pushed an updated version of the r600g tess support to my
> r600g-tess-submit branch.

FWIW:
Tested-by: Grazvydas Ignotas 
on JUNIPER XT with heaven and piglit, no issues noticed.

Gražvydas
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600 tess branches updated

2015-12-03 Thread Dave Airlie
Hey all,

I've pushed an updated version of the r600g tess support to my
r600g-tess-submit branch.

I'm in two minds whether we need to spam the list again,

I think I've included all the review feedback so far, thanks to
everyone that looked.

The major changes since the last posting are:

use 24-bit math operations for LDS index calculations.
CAICOS/SUMO thread count changes - seems to make heaven run
dropping pointless delay slots in LDS reads
attempt to calculate SQ_LDS_ALLOC.HS_NUM_WAVES properly
don't re-emit the LDS constant buffers if we don't have to.
fix sb GDS decoder as per Glenn's request
fix some minor bugs in the previous submit branch.

I'd probably like to push this all next week unless anyone can find
an objection!

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600 geometry shader workarounds

2015-11-09 Thread Dave Airlie
I've had these sitting locally and heiko on #dri-devel found
they fixed some issues for him.

Marek provided me with some errata and this is the results of
implementing them.

It is nearly all fixes for r600 era hw.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 multiple stream support + one misc debug patch

2015-08-25 Thread Edward O'Callaghan
This patch series is:

Reviewed-by: Edward O'Callaghan eocallag...@alterapraxis.com

P.S. thanks for polishing it Dave!

-- 
  Edward O'Callaghan
  edward.ocallag...@koparo.com

On Tue, Aug 25, 2015, at 11:18 AM, Dave Airlie wrote:
 This adds multiple stream support for ARB_gpu_shader5, and one
 other patch.
 
 It doesn't expose ARB_gpu_shader5 yet, as I think we'd like
 to try and get SB support for it into some sort of shape first.
 
 Dave.
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600 multiple stream support + one misc debug patch

2015-08-24 Thread Dave Airlie
This adds multiple stream support for ARB_gpu_shader5, and one
other patch.

It doesn't expose ARB_gpu_shader5 yet, as I think we'd like
to try and get SB support for it into some sort of shape first.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-16 Thread Vadim Girlin

On 12/16/2014 05:44 AM, Dave Airlie wrote:

On 16 December 2014 at 08:59, Vadim Girlin vadimgir...@gmail.com wrote:

On 12/16/2014 01:30 AM, Dave Airlie wrote:




New patch is attached, the only difference is in the sb_sched.cpp (it
disables copy coalescing for some unsafe cases, so it may leave more
MOVs
than previously, but I don't think there will be any noticeable effect
on
performance).

So far I don't see any problems with it, but I don't have many GL apps
on
the test machine. At least lightsmark and unigine demos work for me.



Based on my limited understanding of the code:

Acked-by: Alex Deucher alexander.deuc...@amd.com




Alex, thanks for the review, I understand you wanted it to get into mesa
release, but it really needs careful testing with more apps, so far I
hoped
Dave would do it as long as he's looking into these issues anyway. In
theory
I can also install steam on the test machine and some games, it just
needs
the time and I'm not sure if I'll find it, so far my main job is
sufficient
to make me pretty tired.

Current scheduler in SB is very fragile after adding handling for all
special cases discovered during initial debugging etc, I said since the
very
beginning that I'd like to rewrite it, if only I had time. So any change
like this can potentially break some apps even if piglit passes, and I'm
not
ready to take responsibility for that if I commit it myself, I just don't
have time to deal with all possible consequences on all supported chips.

If you think it's ok, just push this patch (it requires revert of the
previous Dave's commit 7b0067d2). I'm really sorry that I can't do more
to
help with it.



Myself and Glenn are looking at it, Glenn noticed a piglit regression
from this yesterday, I'll reproduce today and take a look.



Hi, Dave & Glenn,

Thanks for looking into it. FWIW, when I worked on it I ran piglit's
quick tests and didn't see any regressions on evergreen (juniper 5750).
There were some failed tests in some piglit runs, but AFAIU they were just
random.


Turns out we had a pre-existing fail that we noticed, not a regression.

I'm going to push this, since it's better than what is there, we can
see if some public testing notices any big issues also.


Thanks, Dave. I'm really sorry that I can't pay as much attention to 
that code as I'd like, and I really appreciate your and Glenn's efforts 
for maintaining it.


(In case someone thinks it's my fault, I must remind you: I warned that I 
won't be able to support it even before it was merged. So please don't 
blame me :) ).

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600 atomics integration

2014-12-16 Thread Aditya Avinash
Hi,
I am trying to integrate atomic counters to Mesa. I was able to do until
pipes (with the help of Marek and Ilia). How to integrate them to R600?
Thank you!



-- 
Regards,

*Aditya Atluri,*

*USA.*
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-15 Thread Vadim Girlin

On 12/12/2014 05:28 PM, Alex Deucher wrote:

On Wed, Dec 10, 2014 at 6:50 AM, Vadim Girlin vadimgir...@gmail.com wrote:

On 12/09/2014 07:39 AM, Vadim Girlin wrote:


On 12/09/2014 05:18 AM, Dave Airlie wrote:


On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:


On 12/06/2014 07:13 AM, Vadim Girlin wrote:



On 12/04/2014 01:43 AM, Dave Airlie wrote:



Hi Vadim,

I've been looking with Glenn's help into a bug in sb for a couple of
weeks now triggered by a change in how GLSL generates switch
statements.

I understand you probably aren't too interested in r600g but I believe
I'm hitting a design level problem and I would like some advice.

So it appears that GLSL can create loops that don't repeat for switch
statements, and it appears SB wasn't ready to handle such a thing.




Hi, Dave,

I suspect we should rather get rid of such loops somehow, i.e. convert
to something else, the loop that never repeats is not really a loop
anyway. AFAICS continue is not supported in switch statements
according to GLSL specs, so the loops generated for switch will
never be
repeated. Am I missing something? Even if repeating is possible
somehow,
at least we can get rid of the loops that are not repeated.

I think loops are less efficient than other control flow
instructions on
r600g hw (at least because they increase stack usage), and possibly on
other hw too.

In fact it seems sb basically gets rid of it already in IR, it just
doesn't know how to translate resulting control flow to ISA, because so
far it only supports specific control flow structure for if-then-else
that was previously preserved during optimizations. I think it may be
not very hard to implement support for that in finalizer, I'll look
into
it.




In fact handling that control flow in finalizer is not as easy as I
hoped,
probably impossible, at least if we want to make it efficient. I forgot
about the limitations of R600 ISA.

OTOH it seems I've managed to fix the issues with loops, the patch is
attached (it's meant to be used instead of 7b0067d2). There are no
piglit
regressions on evergreen, but I didn't test any real apps.


This does seem to fix the problems in piglit, and looks close to what
I was attempting but written by someone who knows what they are doing :-)

What is the sb_sched.cpp change at the end for?



It fixes those scheduler/regalloc errors for switch tests.

Unfortunately, now I've installed some benchmarks for testing and AFAICS
this patch breaks at least lightsmark 2008, so it seems the condition
removed by the patch was there for a reason.

I'll probably try to come up with better fix.



New patch is attached, the only difference is in the sb_sched.cpp (it
disables copy coalescing for some unsafe cases, so it may leave more MOVs
than previously, but I don't think there will be any noticeable effect on
performance).

So far I don't see any problems with it, but I don't have many GL apps on
the test machine. At least lightsmark and unigine demos work for me.



Based on my limited understanding of the code:

Acked-by: Alex Deucher alexander.deuc...@amd.com


Alex, thanks for the review, I understand you wanted it to get into mesa 
release, but it really needs careful testing with more apps, so far I 
hoped Dave would do it as long as he's looking into these issues anyway. 
In theory I can also install steam on the test machine and some games, 
it just needs the time and I'm not sure if I'll find it, so far my main 
job is sufficient to make me pretty tired.


Current scheduler in SB is very fragile after adding handling for all 
special cases discovered during initial debugging etc, I said since the 
very beginning that I'd like to rewrite it, if only I had time. So any 
change like this can potentially break some apps even if piglit passes, 
and I'm not ready to take responsibility for that if I commit it myself, 
I just don't have time to deal with all possible consequences on all 
supported chips.


If you think it's ok, just push this patch (it requires revert of the 
previous Dave's commit 7b0067d2). I'm really sorry that I can't do more 
to help with it.


Vadim





Vadim




Vadim



Dave.






___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-15 Thread Dave Airlie


 New patch is attached, the only difference is in the sb_sched.cpp (it
 disables copy coalescing for some unsafe cases, so it may leave more
 MOVs
 than previously, but I don't think there will be any noticeable effect on
 performance).

 So far I don't see any problems with it, but I don't have many GL apps on
 the test machine. At least lightsmark and unigine demos work for me.


 Based on my limited understanding of the code:

 Acked-by: Alex Deucher alexander.deuc...@amd.com


 Alex, thanks for the review, I understand you wanted it to get into mesa
 release, but it really needs careful testing with more apps, so far I hoped
 Dave would do it as long as he's looking into these issues anyway. In theory
 I can also install steam on the test machine and some games, it just needs
 the time and I'm not sure if I'll find it, so far my main job is sufficient
 to make me pretty tired.

 Current scheduler in SB is very fragile after adding handling for all
 special cases discovered during initial debugging etc, I said since the very
 beginning that I'd like to rewrite it, if only I had time. So any change
 like this can potentially break some apps even if piglit passes, and I'm not
 ready to take responsibility for that if I commit it myself, I just don't
 have time to deal with all possible consequences on all supported chips.

 If you think it's ok, just push this patch (it requires revert of the
 previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to
 help with it.

Myself and Glenn are looking at it, Glenn noticed a piglit regression
from this yesterday, I'll reproduce today and take a look.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-15 Thread Vadim Girlin

On 12/16/2014 01:30 AM, Dave Airlie wrote:



New patch is attached, the only difference is in the sb_sched.cpp (it
disables copy coalescing for some unsafe cases, so it may leave more
MOVs
than previously, but I don't think there will be any noticeable effect on
performance).

So far I don't see any problems with it, but I don't have many GL apps on
the test machine. At least lightsmark and unigine demos work for me.



Based on my limited understanding of the code:

Acked-by: Alex Deucher alexander.deuc...@amd.com



Alex, thanks for the review, I understand you wanted it to get into mesa
release, but it really needs careful testing with more apps, so far I hoped
Dave would do it as long as he's looking into these issues anyway. In theory
I can also install steam on the test machine and some games, it just needs
the time and I'm not sure if I'll find it, so far my main job is sufficient
to make me pretty tired.

Current scheduler in SB is very fragile after adding handling for all
special cases discovered during initial debugging etc, I said since the very
beginning that I'd like to rewrite it, if only I had time. So any change
like this can potentially break some apps even if piglit passes, and I'm not
ready to take responsibility for that if I commit it myself, I just don't
have time to deal with all possible consequences on all supported chips.

If you think it's ok, just push this patch (it requires revert of the
previous Dave's commit 7b0067d2). I'm really sorry that I can't do more to
help with it.


Myself and Glenn are looking at it, Glenn noticed a piglit regression
from this yesterday, I'll reproduce today and take a look.


Hi, Dave & Glenn,

Thanks for looking into it. FWIW, when I worked on it I ran piglit's 
quick tests and didn't see any regressions on evergreen (juniper 5750). 
There were some failed tests in some piglit runs, but AFAIU they were 
just random.


If there are any problems with this fix, I'll be glad to try to help, if 
time allows.


Vadim




Dave.



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-15 Thread Dave Airlie
On 16 December 2014 at 08:59, Vadim Girlin vadimgir...@gmail.com wrote:
 On 12/16/2014 01:30 AM, Dave Airlie wrote:



 New patch is attached, the only difference is in the sb_sched.cpp (it
 disables copy coalescing for some unsafe cases, so it may leave more
 MOVs
 than previously, but I don't think there will be any noticeable effect
 on
 performance).

 So far I don't see any problems with it, but I don't have many GL apps
 on
 the test machine. At least lightsmark and unigine demos work for me.


 Based on my limited understanding of the code:

 Acked-by: Alex Deucher alexander.deuc...@amd.com



 Alex, thanks for the review, I understand you wanted it to get into mesa
 release, but it really needs careful testing with more apps, so far I
 hoped
 Dave would do it as long as he's looking into these issues anyway. In
 theory
 I can also install steam on the test machine and some games, it just
 needs
 the time and I'm not sure if I'll find it, so far my main job is
 sufficient
 to make me pretty tired.

 Current scheduler in SB is very fragile after adding handling for all
 special cases discovered during initial debugging etc, I said since the
 very
 beginning that I'd like to rewrite it, if only I had time. So any change
 like this can potentially break some apps even if piglit passes, and I'm
 not
 ready to take responsibility for that if I commit it myself, I just don't
 have time to deal with all possible consequences on all supported chips.

 If you think it's ok, just push this patch (it requires revert of the
 previous Dave's commit 7b0067d2). I'm really sorry that I can't do more
 to
 help with it.


 Myself and Glenn are looking at it, Glenn noticed a piglit regression
 from this yesterday, I'll reproduce today and take a look.


 Hi, Dave & Glenn,

 Thanks for looking into it. FWIW, when I worked on it I ran piglit's
 quick tests and didn't see any regressions on evergreen (juniper 5750).
 There were some failed tests in some piglit runs, but AFAIU they were just
 random.

Turns out we had a pre-existing fail that we noticed, not a regression.

I'm going to push this, since it's better than what is there, we can
see if some public testing notices any big issues also.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-12 Thread Alex Deucher
On Wed, Dec 10, 2014 at 6:50 AM, Vadim Girlin vadimgir...@gmail.com wrote:
 On 12/09/2014 07:39 AM, Vadim Girlin wrote:

 On 12/09/2014 05:18 AM, Dave Airlie wrote:

 On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:

 On 12/06/2014 07:13 AM, Vadim Girlin wrote:


 On 12/04/2014 01:43 AM, Dave Airlie wrote:


 Hi Vadim,

 I've been looking with Glenn's help into a bug in sb for a couple of
 weeks now triggered by a change in how GLSL generates switch
 statements.

 I understand you probably aren't too interested in r600g but I believe
 I'm hitting a design level problem and I would like some advice.

 So it appears that GLSL can create loops that don't repeat for switch
 statements, and it appears SB wasn't ready to handle such a thing.



 Hi, Dave,

 I suspect we should rather get rid of such loops somehow, i.e. convert
 to something else, the loop that never repeats is not really a loop
 anyway. AFAICS continue is not supported in switch statements
 according to GLSL specs, so the loops generated for switch will
 never be
 repeated. Am I missing something? Even if repeating is possible
 somehow,
 at least we can get rid of the loops that are not repeated.

 I think loops are less efficient than other control flow
 instructions on
 r600g hw (at least because they increase stack usage), and possibly on
 other hw too.

 In fact it seems sb basically gets rid of it already in IR, it just
 doesn't know how to translate resulting control flow to ISA, because so
 far it only supports specific control flow structure for if-then-else
 that was previously preserved during optimizations. I think it may be
 not very hard to implement support for that in finalizer, I'll look
 into
 it.



 In fact handling that control flow in finalizer is not as easy as I
 hoped,
 probably impossible, at least if we want to make it efficient. I forgot
 about the limitations of R600 ISA.

 OTOH it seems I've managed to fix the issues with loops, the patch is
 attached (it's meant to be used instead of 7b0067d2). There are no
 piglit
 regressions on evergreen, but I didn't test any real apps.

 This does seem to fix the problems in piglit, and looks close to what
 I was attempting but written by someone who knows what they are doing :-)

 What is the sb_sched.cpp change at the end for?


 It fixes those scheduler/regalloc errors for switch tests.

 Unfortunately, now I've installed some benchmarks for testing and AFAICS
 this patch breaks at least lightsmark 2008, so it seems the condition
 removed by the patch was there for a reason.

 I'll probably try to come up with better fix.


 New patch is attached, the only difference is in the sb_sched.cpp (it
 disables copy coalescing for some unsafe cases, so it may leave more MOVs
 than previously, but I don't think there will be any noticeable effect on
 performance).

 So far I don't see any problems with it, but I don't have many GL apps on
 the test machine. At least lightsmark and unigine demos work for me.


Based on my limited understanding of the code:

Acked-by: Alex Deucher alexander.deuc...@amd.com

 Vadim



 Vadim


 Dave.




 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-10 Thread Vadim Girlin

On 12/09/2014 07:39 AM, Vadim Girlin wrote:

On 12/09/2014 05:18 AM, Dave Airlie wrote:

On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:

On 12/06/2014 07:13 AM, Vadim Girlin wrote:


On 12/04/2014 01:43 AM, Dave Airlie wrote:


Hi Vadim,

I've been looking with Glenn's help into a bug in sb for a couple of
weeks now triggered by a change in how GLSL generates switch
statements.

I understand you probably aren't too interested in r600g but I believe
I'm hitting a design level problem and I would like some advice.

So it appears that GLSL can create loops that don't repeat for switch
statements, and it appears SB wasn't ready to handle such a thing.



Hi, Dave,

I suspect we should rather get rid of such loops somehow, i.e. convert
to something else, the loop that never repeats is not really a loop
anyway. AFAICS continue is not supported in switch statements
according to GLSL specs, so the loops generated for switch will
never be
repeated. Am I missing something? Even if repeating is possible
somehow,
at least we can get rid of the loops that are not repeated.

I think loops are less efficient than other control flow
instructions on
r600g hw (at least because they increase stack usage), and possibly on
other hw too.

In fact it seems sb basically gets rid of it already in IR, it just
doesn't know how to translate resulting control flow to ISA, because so
far it only supports specific control flow structure for if-then-else
that was previously preserved during optimizations. I think it may be
not very hard to implement support for that in finalizer, I'll look
into
it.



In fact handling that control flow in finalizer is not as easy as I
hoped,
probably impossible, at least if we want to make it efficient. I forgot
about the limitations of R600 ISA.

OTOH it seems I've managed to fix the issues with loops, the patch is
attached (it's meant to be used instead of 7b0067d2). There are no
piglit
regressions on evergreen, but I didn't test any real apps.


This does seem to fix the problems in piglit, and looks close to what
I was attempting but written by someone who knows what they are doing :-)

What is the sb_sched.cpp change at the end for?


It fixes those scheduler/regalloc errors for switch tests.

Unfortunately, now I've installed some benchmarks for testing and AFAICS
this patch breaks at least lightsmark 2008, so it seems the condition
removed by the patch was there for a reason.

I'll probably try to come up with better fix.


New patch is attached, the only difference is in the sb_sched.cpp (it 
disables copy coalescing for some unsafe cases, so it may leave more 
MOVs than previously, but I don't think there will be any noticeable 
effect on performance).


So far I don't see any problems with it, but I don't have many GL apps 
on the test machine. At least lightsmark and unigine demos work for me.


Vadim




Vadim



Dave.





From d2d16fa39c7b4e871d67e05bad92a540d7e5ea68 Mon Sep 17 00:00:00 2001
From: Vadim Girlin vadimgir...@gmail.com
Date: Wed, 10 Dec 2014 14:41:10 +0300
Subject: [PATCH] r600g/sb: fix issues with loops created for switch

---
 src/gallium/drivers/r600/sb/sb_bc_finalize.cpp   | 2 ++
 src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 2 ++
 src/gallium/drivers/r600/sb/sb_if_conversion.cpp | 4 ++--
 src/gallium/drivers/r600/sb/sb_ir.h  | 9 +++--
 src/gallium/drivers/r600/sb/sb_sched.cpp | 3 +++
 5 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
index f0849ca..3f362c4 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
@@ -110,6 +110,8 @@ int bc_finalizer::run() {
 
 void bc_finalizer::finalize_loop(region_node* r) {
 
+	update_nstack(r);
+
 	cf_node *loop_start = sh.create_cf(CF_OP_LOOP_START_DX10);
 	cf_node *loop_end = sh.create_cf(CF_OP_LOOP_END);
 
diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
index d787e5b..403f938 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
@@ -758,6 +758,8 @@ int bc_parser::prepare_loop(cf_node* c) {
 	c->insert_before(reg);
 	rep->move(c, end->next);
 
+	reg->src_loop = true;
+
 	loop_stack.push(reg);
 	return 0;
 }
diff --git a/src/gallium/drivers/r600/sb/sb_if_conversion.cpp b/src/gallium/drivers/r600/sb/sb_if_conversion.cpp
index 93edace..3f2b1b1 100644
--- a/src/gallium/drivers/r600/sb/sb_if_conversion.cpp
+++ b/src/gallium/drivers/r600/sb/sb_if_conversion.cpp
@@ -115,13 +115,13 @@ void if_conversion::convert_kill_instructions(region_node *r,
 bool if_conversion::check_and_convert(region_node *r) {
 
 	depart_node *nd1 = static_cast<depart_node*>(r->first);
-	if (!nd1->is_depart())
+	if (!nd1->is_depart() || nd1->target != r)
 		return false;
 	if_node *nif = static_cast<if_node*>(nd1->first);
 	if (!nif->is_if())
 		

Re: [Mesa-dev] r600/sb loop issue

2014-12-08 Thread Vadim Girlin

On 12/06/2014 07:13 AM, Vadim Girlin wrote:

On 12/04/2014 01:43 AM, Dave Airlie wrote:

Hi Vadim,

I've been looking with Glenn's help into a bug in sb for a couple of
weeks now triggered by a change in how GLSL generates switch
statements.

I understand you probably aren't too interested in r600g but I believe
I'm hitting a design level problem and I would like some advice.

So it appears that GLSL can create loops that don't repeat for switch
statements, and it appears SB wasn't ready to handle such a thing.


Hi, Dave,

I suspect we should rather get rid of such loops somehow, i.e. convert
to something else, the loop that never repeats is not really a loop
anyway. AFAICS continue is not supported in switch statements
according to GLSL specs, so the loops generated for switch will never be
repeated. Am I missing something? Even if repeating is possible somehow,
at least we can get rid of the loops that are not repeated.

I think loops are less efficient than other control flow instructions on
r600g hw (at least because they increase stack usage), and possibly on
other hw too.

In fact it seems sb basically gets rid of it already in IR, it just
doesn't know how to translate resulting control flow to ISA, because so
far it only supports specific control flow structure for if-then-else
that was previously preserved during optimizations. I think it may be
not very hard to implement support for that in finalizer, I'll look into
it.


In fact handling that control flow in finalizer is not as easy as I 
hoped, probably impossible, at least if we want to make it efficient. I 
forgot about the limitations of R600 ISA.


OTOH it seems I've managed to fix the issues with loops, the patch is 
attached (it's meant to be used instead of 7b0067d2). There are no 
piglit regressions on evergreen, but I didn't test any real apps.


Vadim






sb has the ->is_loop() and it just checks !repeats.empty(), so this
meant in the finalizer code we'd fall into the if statement which
would then assert.

I hacked/fixed (more hacked), this in
7b0067d23a6f64cf83c42e7f11b2cd4100c569fe
which attempts to detect single pass loops and handle things that way.

However this led to stack depth calculations being incorrectly done,
so I moved the single loop detect into the is_loop check, (see
attached patch).

This fixes the rendering in some places, but led to a regression in
tests/shaders/glsl-vs-continue-in-switch-in-do-while.shader_test
error at : PHI t76||FP@R3.x,   t128||FP@R3.x, t115||FP@R3.x,
t102||FP@R3.x, t89||FP@R3.x : expected
 operand value t115||FP@R3.x, gpr contains t17||FP@R3.x
error at : PHI t76||FP@R3.x,   t128||FP@R3.x, t115||FP@R3.x,
t102||FP@R3.x, t89||FP@R3.x : expected
 operand value t102||FP@R3.x, gpr contains t17||FP@R3.x

Now Glenn suspected this was due to the is_loop check in
sb_shader.cpp:create_bbs,
and changing that check to only detect repeating loops removes that
issue,
but introduces stack sizing issues again, resulting in lockups/random
rendering.

So I just want to ask had you considered single loops with an always
break in sb design,


I didn't see such loops with any test cases, so I didn't even think
about it.


and perhaps some idea where things are going so wrong with the
register alloc above.


Not sure, but as long as the only repeat node is optimized away in
bc_parser because it's useless due to unconditional break, I suspect it
may be not easy to make all other code think that it's still a loop.

I've tried a quick fix to not optimize the repeat away for such loops,
but it results in other issues, probably it will require handling this
as a special case in other places, so it doesn't look like a good idea
either.

I'll try to implement the solution that I described above, that is,
translate resulting control flow back to ISA. If it won't be too much
work, it's probably the best way and it won't use loop instructions in
the end.



I suspect I'll keep digging into this, but it's getting to the edges of
the brain space/time I can find!

Dave.





From 4967ef90847f921fc0ef7c018ae7ae8048d2a6ce Mon Sep 17 00:00:00 2001
From: Vadim Girlin vadimgir...@gmail.com
Date: Mon, 8 Dec 2014 13:11:48 +0300
Subject: [PATCH] r600g/sb: fix issues with loops created for switch statements

---
 src/gallium/drivers/r600/sb/sb_bc_finalize.cpp   | 2 ++
 src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 2 ++
 src/gallium/drivers/r600/sb/sb_if_conversion.cpp | 4 ++--
 src/gallium/drivers/r600/sb/sb_ir.h  | 9 +++--
 src/gallium/drivers/r600/sb/sb_sched.cpp | 2 +-
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
index f0849ca..3f362c4 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
@@ -110,6 +110,8 @@ int bc_finalizer::run() {
 
 void bc_finalizer::finalize_loop(region_node* 

Re: [Mesa-dev] r600/sb loop issue

2014-12-08 Thread Dave Airlie
On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:
 On 12/06/2014 07:13 AM, Vadim Girlin wrote:

 On 12/04/2014 01:43 AM, Dave Airlie wrote:

 Hi Vadim,

 I've been looking with Glenn's help into a bug in sb for a couple of
 weeks now triggered by a change in how GLSL generates switch
 statements.

 I understand you probably aren't too interested in r600g but I believe
 I'm hitting a design level problem and I would like some advice.

 So it appears that GLSL can create loops that don't repeat for switch
 statements, and it appears SB wasn't ready to handle such a thing.


 Hi, Dave,

 I suspect we should rather get rid of such loops somehow, i.e. convert
 to something else, the loop that never repeats is not really a loop
 anyway. AFAICS continue is not supported in switch statements
 according to GLSL specs, so the loops generated for switch will never be
 repeated. Am I missing something? Even if repeating is possible somehow,
 at least we can get rid of the loops that are not repeated.

 I think loops are less efficient than other control flow instructions on
 r600g hw (at least because they increase stack usage), and possibly on
 other hw too.

 In fact it seems sb basically gets rid of it already in IR, it just
 doesn't know how to translate resulting control flow to ISA, because so
 far it only supports specific control flow structure for if-then-else
 that was previously preserved during optimizations. I think it may be
 not very hard to implement support for that in finalizer, I'll look into
 it.


 In fact handling that control flow in finalizer is not as easy as I hoped,
 probably impossible, at least if we want to make it efficient. I forgot
 about the limitations of R600 ISA.

 OTOH it seems I've managed to fix the issues with loops, the patch is
 attached (it's meant to be used instead of 7b0067d2). There are no piglit
 regressions on evergreen, but I didn't test any real apps.


This fixes one thing, but the switches are still broken here on cayman at least

tests/spec/glsl-1.30/execution/switch/fs-default_last.shader_test

--
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL OUT[0], COLOR
DCL CONST[0]
DCL TEMP[0..2], LOCAL
IMM[0] FLT32 {0., 1., 0., 0.}
IMM[1] UINT32 {0, 4294967295, 0, 0}
IMM[2] INT32 {1, 0, 0, 0}
  0: MOV TEMP[0], IMM[0].
  1: MOV TEMP[1].x, IMM[1].
  2: BGNLOOP :0
  3:   UCMP TEMP[1].x, CONST[0]., TEMP[1]., IMM[1].
  4:   UIF TEMP[1]. :0
  5: MOV TEMP[0].x, IMM[0].
  6: BRK
  7:   ENDIF
  8:   USEQ TEMP[2].x, IMM[2]., CONST[0].
  9:   UCMP TEMP[1].x, TEMP[2]., IMM[1]., TEMP[1].
 10:   UIF TEMP[1]. :0
 11: MOV TEMP[0].y, IMM[0].
 12: BRK
 13:   ENDIF
 14:   MOV TEMP[1].x, IMM[1].
 15:   MOV TEMP[0].z, IMM[0].
 16:   BRK
 17: ENDLOOP :0
 18: MOV OUT[0], TEMP[0]
 19: END

= SHADER #13  PS/CAYMAN/CAYMAN =
= 72 dw = 6 gprs = 2 stack =
  0012 a010 ALU 5 @36
 0036  00f8 00200c90 1  x: MOVR1.x,  0
 0038  00f8 20200c90y: MOVR1.y,  0
 0040  00f8 40200c90z: MOVR1.z,  0
 0042  80f8 60200c90w: MOVR1.w,  0
 0044  80f8 00400c90 2  x: MOVR2.x,  0
0002  000f 8180 LOOP_START_DX10 @30
0004  4017 a404 ALU_PUSH_BEFORE 2 @46 KC0[CB0:0-15]
 0046  809f6080 0043c002 3  x: CNDGE_INT  R2.x,
KC0[0].x, -1, R2.x
 0048  801f00fe 00a0229c 4 MP   x: PRED_SETNE_INT R5.x,  PV.x, 0
0006  0007 8281 JUMP @14 POP:1
0008  0019 a000 ALU 1 @50
 0050  84f9 00200c90 5  x: MOVR1.x,  1.0
0010  000e 8240 LOOP_BREAK @28
0012  0007 8381 POP @14 POP:1
0014  401a a408 ALU_PUSH_BEFORE 3 @52 KC0[CB0:0-15]
 0052  801000fa 00601d10 6  x: SETE_INT   R3.x,  1, KC0[0].x
 0054  800040fe 0043c4fb 7  x: CNDGE_INT  R2.x,  PV.x, R2.x, -1
 0056  801f00fe 00a0229c 8 MP   x: PRED_SETNE_INT R5.x,  PV.x, 0
0016  000c 8281 JUMP @24 POP:1
0018  001d a000 ALU 1 @58
 0058  84f9 20200c90 9  y: MOVR1.y,  1.0
0020  000e 8240 LOOP_BREAK @28
0022  000c 8381 POP @24 POP:1
0024  001e a004 ALU 2 @60
 0060  04fb 00400c9010  x: MOVR2.x,  -1
 0062  84f9 40200c90z: MOVR1.z,  1.0
0026  000e 8240 LOOP_BREAK @28
0028  0002 8140 LOOP_END @4
0030  0020 a00c ALU 4 @64
 0064  0001 0c9011  x: MOVR0.x,  R1.x
 0066  0401 2c90y: MOVR0.y,  R1.y
 0068  0801 4c90z: MOVR0.z,  R1.z
 0070  8c01 6c90w: MOV   

Re: [Mesa-dev] r600/sb loop issue

2014-12-08 Thread Dave Airlie
On 9 December 2014 at 10:25, Dave Airlie airl...@gmail.com wrote:
 On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:
 On 12/06/2014 07:13 AM, Vadim Girlin wrote:

 On 12/04/2014 01:43 AM, Dave Airlie wrote:

 Hi Vadim,

 I've been looking with Glenn's help into a bug in sb for a couple of
 weeks now triggered by a change in how GLSL generates switch
 statements.

 I understand you probably aren't too interested in r600g but I believe
 I'm hitting a design level problem and I would like some advice.

 So it appears that GLSL can create loops that don't repeat for switch
 statements, and it appears SB wasn't ready to handle such a thing.


 Hi, Dave,

 I suspect we should rather get rid of such loops somehow, i.e. convert
 to something else, the loop that never repeats is not really a loop
 anyway. AFAICS continue is not supported in switch statements
 according to GLSL specs, so the loops generated for switch will never be
 repeated. Am I missing something? Even if repeating is possible somehow,
 at least we can get rid of the loops that are not repeated.

 I think loops are less efficient than other control flow instructions on
 r600g hw (at least because they increase stack usage), and possibly on
 other hw too.

 In fact it seems sb basically gets rid of it already in IR, it just
 doesn't know how to translate resulting control flow to ISA, because so
 far it only supports specific control flow structure for if-then-else
 that was previously preserved during optimizations. I think it may be
 not very hard to implement support for that in finalizer, I'll look into
 it.


 In fact handling that control flow in finalizer is not as easy as I hoped,
 probably impossible, at least if we want to make it efficient. I forgot
 about the limitations of R600 ISA.

 OTOH it seems I've managed to fix the issues with loops, the patch is
 attached (it's meant to be used instead of 7b0067d2). There are no piglit
 regressions on evergreen, but I didn't test any real apps.


 This fixes one thing, but the switches are still broken here on cayman at 
 least

Actually ignore that, another regression snuck into r600g that I had to fix.

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-08 Thread Dave Airlie
On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:
 On 12/06/2014 07:13 AM, Vadim Girlin wrote:

 On 12/04/2014 01:43 AM, Dave Airlie wrote:

 Hi Vadim,

 I've been looking with Glenn's help into a bug in sb for a couple of
 weeks now triggered by a change in how GLSL generates switch
 statements.

 I understand you probably aren't too interested in r600g but I believe
 I'm hitting a design level problem and I would like some advice.

 So it appears that GLSL can create loops that don't repeat for switch
 statements, and it appears SB wasn't ready to handle such a thing.


 Hi, Dave,

 I suspect we should rather get rid of such loops somehow, i.e. convert
 to something else, the loop that never repeats is not really a loop
 anyway. AFAICS continue is not supported in switch statements
 according to GLSL specs, so the loops generated for switch will never be
 repeated. Am I missing something? Even if repeating is possible somehow,
 at least we can get rid of the loops that are not repeated.

 I think loops are less efficient than other control flow instructions on
 r600g hw (at least because they increase stack usage), and possibly on
 other hw too.

 In fact it seems sb basically gets rid of it already in IR, it just
 doesn't know how to translate resulting control flow to ISA, because so
 far it only supports specific control flow structure for if-then-else
 that was previously preserved during optimizations. I think it may be
 not very hard to implement support for that in finalizer, I'll look into
 it.


 In fact handling that control flow in finalizer is not as easy as I hoped,
 probably impossible, at least if we want to make it efficient. I forgot
 about the limitations of R600 ISA.

 OTOH it seems I've managed to fix the issues with loops, the patch is
 attached (it's meant to be used instead of 7b0067d2). There are no piglit
 regressions on evergreen, but I didn't test any real apps.

This does seem to fix the problems in piglit, and looks close to what
I was attempting but written by someone who knows what they are doing :-)

What is the sb_sched.cpp change at the end for?

Dave.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-08 Thread Vadim Girlin

On 12/09/2014 05:18 AM, Dave Airlie wrote:

On 8 December 2014 at 20:41, Vadim Girlin vadimgir...@gmail.com wrote:

On 12/06/2014 07:13 AM, Vadim Girlin wrote:


On 12/04/2014 01:43 AM, Dave Airlie wrote:


Hi Vadim,

I've been looking with Glenn's help into a bug in sb for a couple of
weeks now triggered by a change in how GLSL generates switch
statements.

I understand you probably aren't too interested in r600g but I believe
I'm hitting a design level problem and I would like some advice.

So it appears that GLSL can create loops that don't repeat for switch
statements, and it appears SB wasn't ready to handle such a thing.



Hi, Dave,

I suspect we should rather get rid of such loops somehow, i.e. convert
to something else, the loop that never repeats is not really a loop
anyway. AFAICS continue is not supported in switch statements
according to GLSL specs, so the loops generated for switch will never be
repeated. Am I missing something? Even if repeating is possible somehow,
at least we can get rid of the loops that are not repeated.

I think loops are less efficient than other control flow instructions on
r600g hw (at least because they increase stack usage), and possibly on
other hw too.

In fact it seems sb basically gets rid of it already in IR, it just
doesn't know how to translate resulting control flow to ISA, because so
far it only supports specific control flow structure for if-then-else
that was previously preserved during optimizations. I think it may be
not very hard to implement support for that in finalizer, I'll look into
it.



In fact handling that control flow in finalizer is not as easy as I hoped,
probably impossible, at least if we want to make it efficient. I forgot
about the limitations of R600 ISA.

OTOH it seems I've managed to fix the issues with loops, the patch is
attached (it's meant to be used instead of 7b0067d2). There are no piglit
regressions on evergreen, but I didn't test any real apps.


This does seem to fix the problems in piglit, and looks close to what
I was attempting but written by someone who knows what they are doing :-)

What is the sb_sched.cpp change at the end for?


It fixes those scheduler/regalloc errors for switch tests.

Unfortunately, now I've installed some benchmarks for testing and AFAICS 
this patch breaks at least lightsmark 2008, so it seems the condition 
removed by the patch was there for a reason.


I'll probably try to come up with better fix.

Vadim



Dave.



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-05 Thread Vadim Girlin

On 12/04/2014 01:43 AM, Dave Airlie wrote:

Hi Vadim,

I've been looking with Glenn's help into a bug in sb for a couple of
weeks now triggered by a change in how GLSL generates switch
statements.

I understand you probably aren't too interested in r600g but I believe
I'm hitting a design level problem and I would like some advice.

So it appears that GLSL can create loops that don't repeat for switch
statements, and it appears SB wasn't ready to handle such a thing.


Hi, Dave,

I suspect we should rather get rid of such loops somehow, i.e. convert 
to something else, the loop that never repeats is not really a loop 
anyway. AFAICS continue is not supported in switch statements 
according to GLSL specs, so the loops generated for switch will never be 
repeated. Am I missing something? Even if repeating is possible somehow, 
at least we can get rid of the loops that are not repeated.


I think loops are less efficient than other control flow instructions on 
r600g hw (at least because they increase stack usage), and possibly on 
other hw too.


In fact it seems sb basically gets rid of it already in IR, it just 
doesn't know how to translate resulting control flow to ISA, because so 
far it only supports specific control flow structure for if-then-else 
that was previously preserved during optimizations. I think it may be 
not very hard to implement support for that in finalizer, I'll look into it.




sb has the -is_loop() and it just checks !repeats.empty(), so this
meant in the finalizer code we'd fall into the if statement which
would then assert.

I hacked/fixed (more hacked), this in 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe
which attempts to detect single pass loops and handle things that way.

However this lead to stack depth calculations being incorrectly done,
so I moved the single loop detect into the is_loop check, (see
attached patch).

This fixes the rendering in some places, but lead to a regression in
tests/shaders/glsl-vs-continue-in-switch-in-do-while.shader_test
error at : PHI t76||FP@R3.x,   t128||FP@R3.x, t115||FP@R3.x,
t102||FP@R3.x, t89||FP@R3.x : expected
 operand value t115||FP@R3.x, gpr contains t17||FP@R3.x
error at : PHI t76||FP@R3.x,   t128||FP@R3.x, t115||FP@R3.x,
t102||FP@R3.x, t89||FP@R3.x : expected
 operand value t102||FP@R3.x, gpr contains t17||FP@R3.x

Now Glenn suspected this was due to the is_loop check in
sb_shader.cpp:create_bbs,
and changing that check to only detect repeating loops removes that issue,
but introduces stack sizing issues again, resulting in lockups/random rendering.

So I just want to ask had you considered single loops with an always
break in sb design,


I didn't see such loops with any test cases, so I didn't even think 
about it.



and perhaps some idea where things are going so wrong with the
register alloc above.


Not sure, but as long as the only repeat node is optimized away in 
bc_parser because it's useless due to unconditional break, I suspect it 
may be not easy to make all other code think that it's still a loop.


I've tried a quick fix to not optimize the repeat away for such loops, 
but it results in other issues, probably it will require handling this 
as a special case in other places, so it doesn't look like a good idea 
either.


I'll try to implement the solution that I described above, that is, 
translate resulting control flow back to ISA. If it won't be too much 
work, it's probably the best way and it won't use loop instructions in 
the end.




I suspect I'll keep digging into this, but its getting to the edges of
the brain space/time I can find!

Dave.



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-05 Thread Matt Turner
On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com wrote:
 I suspect we should rather get rid of such loops somehow, i.e. convert to
 something else, the loop that never repeats is not really a loop anyway.
 AFAICS continue is not supported in switch statements according to GLSL
 specs, so the loops generated for switch will never be repeated. Am I
 missing something? Even if repeating is possible somehow, at least we can
 get rid of the loops that are not repeated.

I don't think that's true. I don't see anything in the spec that would
lead me to believe continue cannot occur in a switch statement.

In fact, we have some relatively complicated shaders that have a
continue in a switch. See
tests/shaders/glsl-fs-continue-in-switch-in-do-while.shader_test
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-05 Thread Vadim Girlin

On 12/06/2014 07:50 AM, Matt Turner wrote:

On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com wrote:

I suspect we should rather get rid of such loops somehow, i.e. convert to
something else, the loop that never repeats is not really a loop anyway.
AFAICS continue is not supported in switch statements according to GLSL
specs, so the loops generated for switch will never be repeated. Am I
missing something? Even if repeating is possible somehow, at least we can
get rid of the loops that are not repeated.


I don't think that's true. I don't see anything in the spec that would
lead me to believe continue cannot occur in a switch statement.


I've double-checked some versions of GLSL spec (1.30, 1.50, 3.30, 4.40) 
and all of them say the same (section 6.4 Jumps):


The continue jump is used only in loops.


In fact, we have some relatively complicated shaders that have a
continue in a switch. See
tests/shaders/glsl-fs-continue-in-switch-in-do-while.shader_test



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-05 Thread Matt Turner
On Fri, Dec 5, 2014 at 8:56 PM, Vadim Girlin vadimgir...@gmail.com wrote:
 On 12/06/2014 07:50 AM, Matt Turner wrote:

 On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com
 wrote:

 I suspect we should rather get rid of such loops somehow, i.e. convert to
 something else, the loop that never repeats is not really a loop anyway.
 AFAICS continue is not supported in switch statements according to GLSL
 specs, so the loops generated for switch will never be repeated. Am I
 missing something? Even if repeating is possible somehow, at least we can
 get rid of the loops that are not repeated.


 I don't think that's true. I don't see anything in the spec that would
 lead me to believe continue cannot occur in a switch statement.


 I've double-checked some versions of GLSL spec (1.30, 1.50, 3.30, 4.40) and
 all of them say the same (section 6.4 Jumps):

 The continue jump is used only in loops.

Sure, but isn't the continue below in a loop?

do {
   switch (...) {
   case ...:
  continue;
   }
} while (...);

The grammar is pretty unambiguous.

 jump_statement:
CONTINUE SEMICOLON
BREAK SEMICOLON
RETURN SEMICOLON
RETURN expression SEMICOLON
DISCARD SEMICOLON // Fragment shader only.

If continue can't be in a switch, neither can break. :)
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600/sb loop issue

2014-12-05 Thread Vadim Girlin

On 12/06/2014 08:01 AM, Matt Turner wrote:

On Fri, Dec 5, 2014 at 8:56 PM, Vadim Girlin vadimgir...@gmail.com wrote:

On 12/06/2014 07:50 AM, Matt Turner wrote:


On Fri, Dec 5, 2014 at 8:13 PM, Vadim Girlin vadimgir...@gmail.com
wrote:


I suspect we should rather get rid of such loops somehow, i.e. convert to
something else, the loop that never repeats is not really a loop anyway.
AFAICS continue is not supported in switch statements according to GLSL
specs, so the loops generated for switch will never be repeated. Am I
missing something? Even if repeating is possible somehow, at least we can
get rid of the loops that are not repeated.



I don't think that's true. I don't see anything in the spec that would
lead me to believe continue cannot occur in a switch statement.



I've double-checked some versions of GLSL spec (1.30, 1.50, 3.30, 4.40) and
all of them say the same (section 6.4 Jumps):

The continue jump is used only in loops.


Sure, but isn't the continue below in a loop?

do {
switch (...) {
case ...:
   continue;
}
} while (...);



Ah, now I see, you're right. I just was mostly thinking about that loop 
that is created for a switch in IR, not about source, and somehow 
confused these things.


Thanks for pointing that out. Hopefully such cases won't complicate the 
problem in sb even more, need to check those tests.



The grammar is pretty unambiguous.

  jump_statement:
 CONTINUE SEMICOLON
 BREAK SEMICOLON
 RETURN SEMICOLON
 RETURN expression SEMICOLON
 DISCARD SEMICOLON // Fragment shader only.

If continue can't be in a switch, neither can break. :)



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600/sb loop issue

2014-12-03 Thread Dave Airlie
Hi Vadim,

I've been looking with Glenn's help into a bug in sb for a couple of
weeks now triggered by a change in how GLSL generates switch
statements.

I understand you probably aren't too interested in r600g but I believe
I'm hitting a design level problem and I would like some advice.

So it appears that GLSL can create loops that don't repeat for switch
statements, and it appears SB wasn't ready to handle such a thing.

sb has the ->is_loop() check, which just tests !repeats.empty(), so this
meant that in the finalizer code we'd fall into the if statement, which
would then assert.

I hacked/fixed (more hacked), this in 7b0067d23a6f64cf83c42e7f11b2cd4100c569fe
which attempts to detect single pass loops and handle things that way.

However, this led to stack depth calculations being done incorrectly,
so I moved the single-loop detection into the is_loop check (see
attached patch).

This fixes the rendering in some places, but led to a regression in
tests/shaders/glsl-vs-continue-in-switch-in-do-while.shader_test
error at : PHI t76||FP@R3.x,   t128||FP@R3.x, t115||FP@R3.x,
t102||FP@R3.x, t89||FP@R3.x : expected
operand value t115||FP@R3.x, gpr contains t17||FP@R3.x
error at : PHI t76||FP@R3.x,   t128||FP@R3.x, t115||FP@R3.x,
t102||FP@R3.x, t89||FP@R3.x : expected
operand value t102||FP@R3.x, gpr contains t17||FP@R3.x

Now Glenn suspected this was due to the is_loop check in
sb_shader.cpp:create_bbs,
and changing that check to only detect repeating loops removes that issue,
but introduces stack sizing issues again, resulting in lockups/random rendering.

So I just want to ask had you considered single loops with an always
break in sb design,
and perhaps some idea where things are going so wrong with the
register alloc above.

I suspect I'll keep digging into this, but it's getting to the edges of
the brain space/time I can find!

Dave.
From 170184b712d9596f761acdee2c7cff2a2792d937 Mon Sep 17 00:00:00 2001
From: Dave Airlie airl...@redhat.com
Date: Wed, 3 Dec 2014 13:05:18 +1000
Subject: [PATCH] r600g/sb: detect empty once iterated loops

Since GLSL changed to using loops for switches,
we've hit a bug in sb with single execution loops,

I previously attempted to fix this by changing where
we detected loops, however this isn't good enough
as SB gets the stack sizing wrong.

Fix this by checking inside the is_loop for
single execution loops.

This should fix lockups on rv635 and misrenderings
on cayman since the first fix:
7b0067d23a6f64cf83c42e7f11b2cd4100c569fe
fix issues cause by GLSL switching to loops for switch

Signed-off-by: Dave Airlie airl...@redhat.com
---
 src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 22 +++
 src/gallium/drivers/r600/sb/sb_ir.cpp  | 30 ++
 src/gallium/drivers/r600/sb/sb_ir.h|  3 +--
 3 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
index 0fa0910..56189c9 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
@@ -46,19 +46,7 @@ int bc_finalizer::run() {
 	for (regions_vec::reverse_iterator I = rv.rbegin(), E = rv.rend(); I != E;
 			++I) {
 		region_node *r = *I;
-		bool is_if = false;
-		assert(r);
-
-		assert(r->first);
-		if (r->first->is_container()) {
-			container_node *repdep1 = static_cast<container_node*>(r->first);
-			assert(repdep1->is_depart() || repdep1->is_repeat());
-			if_node *n_if = static_cast<if_node*>(repdep1->first);
-			if (n_if && n_if->is_if())
-				is_if = true;
-		}
-
-		if (is_if)
+		if (!r->is_loop())
 			finalize_if(r);
 		else
 			finalize_loop(r);
@@ -121,7 +109,13 @@ void bc_finalizer::finalize_loop(region_node* r) {
 	cf_node *loop_end = sh.create_cf(CF_OP_LOOP_END);
 	bool has_instr = false;
 
-	if (!r->is_loop()) {
+	/*
+	 * if we have repeats then we have instructions,
+	 * if we have no repeats for a single loops,
+	 * check if there are any instructions in the depart
+	 * nodes.
+	 */
+	if (r->repeats.empty()) {
 		for (depart_vec::iterator I = r->departs.begin(), E = r->departs.end();
 		 I != E; ++I) {
 			depart_node *dep = *I;
diff --git a/src/gallium/drivers/r600/sb/sb_ir.cpp b/src/gallium/drivers/r600/sb/sb_ir.cpp
index 5226893..a4c4e83 100644
--- a/src/gallium/drivers/r600/sb/sb_ir.cpp
+++ b/src/gallium/drivers/r600/sb/sb_ir.cpp
@@ -485,6 +485,36 @@ void container_node::collect_stats(node_stats &s) {
 	}
 }
 
+bool region_node::is_loop() {
+
+	if (!repeats.empty())
+		return true;
+
+	/*
+	 * single pass loops have no repeats, however we need to detect
+	 * them as loops.
+	 */
+
+	/* if we have no first in the region then it can't be a loop. */
+	if (!first)
+		return false;
+
+	/*
+	 * if the first is a container, see if it has an if node,
+	 * if nodes aren't loops, if there is no if node,
+	 * then this is a single pass loops.
+	 */
+	if (first->is_container()) {
+		container_node 

Re: [Mesa-dev] R600/OpenCL - kernel_param resource

2014-04-11 Thread Tom Stellard
On Thu, Apr 10, 2014 at 03:24:32PM +, Dorrington, Albert wrote:
 I am having an issue with a memory leak in an OpenCL program I am testing.
 In the program I call the same kernel repeatedly, for every pixel in an 
 image. (Probably not the most efficient code, but it is a learning/testing 
 thing.)
 
 One thing in particular I have not yet been able to figure out is what
 releases the reference counts for the shader->kernel_param resource created
 in evergreen_compute_upload_input().
 
 Tracing through the calls:
 evergreen_compute_upload_input()
 evergreen_cs_set_constant_buffer()
 r600_set_constant_buffer()
 
 I can see that if r600_set_constant_buffer() is passed a null 
 pipe_constant_buffer input, that it would reset the stat masks and make the 
 call to pipe_resource_reference() with a NULL, to decrement the count.
 
 But I don't see where that would happen.
 
 I am thinking that perhaps there should be something to release the reference 
 count for that buffer, either after the evergreen_launch_grid() call, or 
 perhaps as the last thing within that call, after the compute_emit_cs() call.
 
 Or, is this call happening somewhere else that I haven't found?


kernel_param is probably the source of the leak, it doesn't look like
we are destroying it anywhere.

-Tom
 
 Thanks,
 Al Dorrington
 Software Engineer Sr
 Lockheed Martin, Mission Systems and Training
 

 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600/OpenCL - kernel_param resource

2014-04-10 Thread Dorrington, Albert
I am having an issue with a memory leak in an OpenCL program I am testing.
In the program I call the same kernel repeatedly, for every pixel in an image. 
(Probably not the most efficient code, but it is a learning/testing thing.)

One thing in particular I have not yet been able to figure out is what
releases the reference counts for the shader->kernel_param resource created in
evergreen_compute_upload_input().

Tracing through the calls:
evergreen_compute_upload_input()
evergreen_cs_set_constant_buffer()
r600_set_constant_buffer()

I can see that if r600_set_constant_buffer() is passed a null 
pipe_constant_buffer input, that it would reset the stat masks and make the 
call to pipe_resource_reference() with a NULL, to decrement the count.

But I don't see where that would happen.

I am thinking that perhaps there should be something to release the reference 
count for that buffer, either after the evergreen_launch_grid() call, or 
perhaps as the last thing within that call, after the compute_emit_cs() call.

Or, is this call happening somewhere else that I haven't found?

Thanks,
Al Dorrington
Software Engineer Sr
Lockheed Martin, Mission Systems and Training

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600+700 geometry shader patch

2014-01-30 Thread Dave Airlie
I've lightly tested this, not piglit strength yet,
and it does require the kernel patch to work.

It's also available in the r600-geom-shaders branch of my repo.

Dave.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/SI: New intrinsics for radeonsi geometry shaders

2014-01-24 Thread Tom Stellard
On Fri, Jan 24, 2014 at 03:17:04PM +0900, Michel Dänzer wrote:
 
 The attached patches add two intrinsics to the R600 backend which are
 necessary for geometry shader support in the radeonsi driver.
 

Patch 1 and v2 of Patch 2 are:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

-Tom

 
 -- 
 Earthling Michel Dänzer|  http://www.amd.com
 Libre software enthusiast  |Mesa and X developer

 From 8feb7201ac894e5a6731a157020ac807936f584d Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
 Date: Fri, 29 Nov 2013 18:21:41 +0900
 Subject: [PATCH 1/2] R600/SI: Add intrinsic for S_SENDMSG instruction
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 
 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 31 
 +++
  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h   |  1 +
  lib/Target/R600/SIInsertWaits.cpp |  6 +
  lib/Target/R600/SIInstructions.td | 16 ++--
  lib/Target/R600/SIIntrinsics.td   |  2 ++
  test/CodeGen/R600/llvm.SI.sendmsg.ll  | 21 +++
  6 files changed, 75 insertions(+), 2 deletions(-)
  create mode 100644 test/CodeGen/R600/llvm.SI.sendmsg.ll
 
 diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp 
 b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
 index 99e1377..7105879 100644
 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
 +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
 @@ -316,6 +316,37 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, 
 unsigned OpNo,
}
  }
  
 +void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
 +                                     raw_ostream &O) {
 +  unsigned SImm16 = MI->getOperand(OpNo).getImm();
 +  unsigned Msg = SImm16 & 0xF;
 +  if (Msg == 2 || Msg == 3) {
 +    unsigned Op = (SImm16 >> 4) & 0xF;
 +    if (Msg == 3)
 +      O << "Gs_done(";
 +    else
 +      O << "Gs(";
 +    if (Op == 0) {
 +      O << "nop";
 +    } else {
 +      unsigned Stream = (SImm16 >> 8) & 0x3;
 +      if (Op == 1)
 +        O << "cut";
 +      else if (Op == 2)
 +        O << "emit";
 +      else if (Op == 3)
 +        O << "emit-cut";
 +      O << " stream " << Stream;
 +    }
 +    O << "), [m0] ";
 +  } else if (Msg == 1)
 +    O << "interrupt ";
 +  else if (Msg == 15)
 +    O << "system ";
 +  else
 +    O << "unknown(" << Msg << ") ";
 +}
 +
  void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
raw_ostream O) {
// Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
 diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h 
 b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
 index 77af942..2876dd2 100644
 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
 +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
 @@ -53,6 +53,7 @@ private:
void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream O);
void printCT(const MCInst *MI, unsigned OpNo, raw_ostream O);
void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream O);
 +  void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream O);
void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream O);
  };
  
 diff --git a/lib/Target/R600/SIInsertWaits.cpp 
 b/lib/Target/R600/SIInsertWaits.cpp
 index 7ef662e..695ec40 100644
 --- a/lib/Target/R600/SIInsertWaits.cpp
 +++ b/lib/Target/R600/SIInsertWaits.cpp
 @@ -314,6 +314,12 @@ Counters SIInsertWaits::handleOperands(MachineInstr MI) 
 {
  
Counters Result = ZeroCounts;
  
 +  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
 +  // but we also want to wait for any other outstanding transfers before
 +  // signalling other hardware blocks
 +  if (MI.getOpcode() == AMDGPU::S_SENDMSG)
 +return LastIssued;
 +
// For each register affected by this
// instruction increase the result sequence
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 diff --git a/lib/Target/R600/SIInstructions.td 
 b/lib/Target/R600/SIInstructions.td
 index 3baa4cd..c0ad398 100644
 --- a/lib/Target/R600/SIInstructions.td
 +++ b/lib/Target/R600/SIInstructions.td
 @@ -22,6 +22,10 @@ def InterpSlot : Operandi32 {
let PrintMethod = printInterpSlot;
  }
  
 +def SendMsgImm : Operandi32 {
 +  let PrintMethod = printSendMsg;
 +}
 +
  def isSI : PredicateSubtarget.getGeneration() 
= AMDGPUSubtarget::SOUTHERN_ISLANDS;
  
 @@ -826,17 +830,25 @@ def S_BARRIER : SOPP 0x000a, (ins), S_BARRIER,
  def S_WAITCNT : SOPP 0x000c, (ins WAIT_FLAG:$simm16), S_WAITCNT 
 $simm16,
[]
  ;
 -} // End hasSideEffects
  //def S_SETHALT : SOPP_ 0x000d, S_SETHALT, [];
  //def S_SLEEP : SOPP_ 0x000e, S_SLEEP, [];
  //def S_SETPRIO : SOPP_ 0x000f, S_SETPRIO, [];
 -//def S_SENDMSG : SOPP_ 0x0010, S_SENDMSG, [];
 +
 +let Uses = [EXEC] in {
 +  

[Mesa-dev] R600/SI: New intrinsics for radeonsi geometry shaders

2014-01-23 Thread Michel Dänzer

The attached patches add two intrinsics to the R600 backend which are
necessary for geometry shader support in the radeonsi driver.


-- 
Earthling Michel Dänzer|  http://www.amd.com
Libre software enthusiast  |Mesa and X developer
From 8feb7201ac894e5a6731a157020ac807936f584d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Fri, 29 Nov 2013 18:21:41 +0900
Subject: [PATCH 1/2] R600/SI: Add intrinsic for S_SENDMSG instruction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp | 31 +++
 lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h   |  1 +
 lib/Target/R600/SIInsertWaits.cpp |  6 +
 lib/Target/R600/SIInstructions.td | 16 ++--
 lib/Target/R600/SIIntrinsics.td   |  2 ++
 test/CodeGen/R600/llvm.SI.sendmsg.ll  | 21 +++
 6 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100644 test/CodeGen/R600/llvm.SI.sendmsg.ll

diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 99e1377..7105879 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -316,6 +316,37 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
   }
 }
 
+void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Msg = SImm16 & 0xF;
+  if (Msg == 2 || Msg == 3) {
+    unsigned Op = (SImm16 >> 4) & 0xF;
+    if (Msg == 3)
+      O << "Gs_done(";
+    else
+      O << "Gs(";
+    if (Op == 0) {
+      O << "nop";
+    } else {
+      unsigned Stream = (SImm16 >> 8) & 0x3;
+      if (Op == 1)
+	O << "cut";
+      else if (Op == 2)
+	O << "emit";
+      else if (Op == 3)
+	O << "emit-cut";
+      O << " stream " << Stream;
+    }
+    O << "), [m0] ";
+  } else if (Msg == 1)
+    O << "interrupt ";
+  else if (Msg == 15)
+    O << "system ";
+  else
+    O << "unknown(" << Msg << ") ";
+}
+
 void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
   raw_ostream O) {
   // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 77af942..2876dd2 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -53,6 +53,7 @@ private:
   void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream O);
   void printCT(const MCInst *MI, unsigned OpNo, raw_ostream O);
   void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream O);
+  void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream O);
   void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream O);
 };
 
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 7ef662e..695ec40 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -314,6 +314,12 @@ Counters SIInsertWaits::handleOperands(MachineInstr MI) {
 
   Counters Result = ZeroCounts;
 
+  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+  // but we also want to wait for any other outstanding transfers before
+  // signalling other hardware blocks
+  if (MI.getOpcode() == AMDGPU::S_SENDMSG)
+return LastIssued;
+
   // For each register affected by this
   // instruction increase the result sequence
   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 3baa4cd..c0ad398 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -22,6 +22,10 @@ def InterpSlot : Operandi32 {
   let PrintMethod = printInterpSlot;
 }
 
+def SendMsgImm : Operandi32 {
+  let PrintMethod = printSendMsg;
+}
+
 def isSI : PredicateSubtarget.getGeneration() 
   = AMDGPUSubtarget::SOUTHERN_ISLANDS;
 
@@ -826,17 +830,25 @@ def S_BARRIER : SOPP 0x000a, (ins), S_BARRIER,
 def S_WAITCNT : SOPP 0x000c, (ins WAIT_FLAG:$simm16), S_WAITCNT $simm16,
   []
 ;
-} // End hasSideEffects
 //def S_SETHALT : SOPP_ 0x000d, S_SETHALT, [];
 //def S_SLEEP : SOPP_ 0x000e, S_SLEEP, [];
 //def S_SETPRIO : SOPP_ 0x000f, S_SETPRIO, [];
-//def S_SENDMSG : SOPP_ 0x0010, S_SENDMSG, [];
+
+let Uses = [EXEC] in {
+  def S_SENDMSG : SOPP 0x0010, (ins SendMsgImm:$simm16, M0Reg:$m0), S_SENDMSG $simm16,
+  [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
+   {
+let DisableEncoding = $m0;
+  }
+} // End Uses = [EXEC]
+
 //def S_SENDMSGHALT : SOPP_ 0x0011, S_SENDMSGHALT, [];
 //def S_TRAP : SOPP_ 0x0012, 

[Mesa-dev] r600 Evergreen Compute and compute_memory_grow_pool

2014-01-20 Thread Hrustich, John
The compute memory pool used by the gallium r600 driver seems to be 
problematic.  The pool looks to be a single radeon buffer object.  There could 
be multiple maps set up into that single buffer object.  If there is a need to 
grow the pool, then the resource associated with the buffer object is 
destroyed, which results in all of the maps for that buffer object also being 
destroyed.  When the new larger pool is created, the pointers that the 
application has to the mapped region are no longer valid.

A temporary work-around would appear to be to make sure that the buffer pool is 
large enough that there isn't a need to grow the pool once any maps into it are 
created.  A longer term solution seems much harder.  Even if the maps could all 
be precisely recreated into the newly allocated buffer object, there would be a 
period of time when the pointers held by the application would be invalid.

John Hrustich
LM Master Software Architect, Mission Systems and Training
Lockheed Martin Corporation
1801 State Route 17C, Mail Drop 0220, Owego, NY 13827
O 607-751-4206 | E john.hrust...@lmco.com
100 Years of Accelerating Tomorrow

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 Evergreen Compute and compute_memory_grow_pool

2014-01-20 Thread Tom Stellard
On Mon, Jan 20, 2014 at 09:32:11PM +0000, Hrustich, John wrote:
 The compute memory pool used by the gallium r600 driver seems to be 
 problematic.  The pool looks to be a single radeon buffer object.  There 
 could be multiple maps set up into that single buffer object.  If there is a 
 need to grow the pool, then the resource associated with the buffer object is 
 destroyed, which results in all of the maps for that buffer object also being 
 destroyed.  When the new larger pool is created, the pointers that the 
 application has to the mapped region are no longer valid.
 
 A temporary work-around would appear to be to make sure that the buffer pool 
 is large enough that there isn't a need to grow the pool once any maps into 
 it are created.  A longer term solution seems much harder.  Even if the maps 
 could all be precisely recreated into the newly allocated buffer object, 
 there would be a period of time when the pointers held by the application 
 would be invalid.


This is just one of the many problems with the compute memory pool.  It
would be good to have some piglit tests for the use case you described.

I think the compute code in r600g has stabilized enough now that we
could consider replacing the memory pool with something else.  I'm open
to suggestions if you have any ideas.

-Tom
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/SI: Support for local memory and derivatives

2013-07-10 Thread Michel Dänzer
On Fre, 2013-06-28 at 14:37 -0700, Tom Stellard wrote:
 On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote:
  
  These patches implement enough of local memory support to allow radeonsi
  to use that for computing derivatives, as suggested by Tom.
  
  They also almost allow test/CodeGen/R600/local-memory.ll to generate
  code for SI. Right now it still fails because it tries to copy a VGPR to
  an SGPR, which is not possible.
 
 Can you add some lit tests for these new intrinsics

Done, updated patches attached.


 and also add CHECK lines for SI to the existing local-memory.ll test.

Can't do that while it still fails to generate SI code. Should I commit
the other patches anyway, which are only necessary for that test?


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
From 3572bab6a6b5c967d19add0b0497a96123754ec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Thu, 21 Feb 2013 16:12:45 +0100
Subject: [PATCH v2 1/4] R600/SI: Add intrinsics for texture sampling with user
 derivatives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---

v2: Add lit test

 lib/Target/R600/SIInstructions.td|   7 +-
 lib/Target/R600/SIIntrinsics.td  |   1 +
 test/CodeGen/R600/llvm.SI.sampled.ll | 140 +++
 3 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/R600/llvm.SI.sampled.ll

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9c96c08..c9eac7d 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027;
 def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029;
-//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a;
+def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b;
 def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L;
 def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B;
@@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type {
   def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type;
   def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
   def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
+
+  def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
+  def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
+  def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
+  def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
 }
 
 defm : SamplePatternsv2i32;
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 224cd2f..d2643e0 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in {
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
   def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem];
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
new file mode 100644
index 0000000..71b8ef5
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -0,0 +1,140 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
+;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
+;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
+
+define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
+   %v1 = insertelement 4 x i32 undef, i32 %a1, i32 0
+   %v2 = insertelement 4 x i32 undef, i32 %a1, i32 1
+   %v3 = insertelement 4 x i32 undef, i32 %a1, i32 2
+   %v4 = insertelement 4 x i32 undef, i32 %a1, i32 3

Re: [Mesa-dev] R600/SI: Support for local memory and derivatives

2013-07-10 Thread Tom Stellard
On Wed, Jul 10, 2013 at 12:32:25PM +0200, Michel Dänzer wrote:
 On Fre, 2013-06-28 at 14:37 -0700, Tom Stellard wrote:
  On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote:
   
   These patches implement enough of local memory support to allow radeonsi
   to use that for computing derivatives, as suggested by Tom.
   
   They also almost allow test/CodeGen/R600/local-memory.ll to generate
   code for SI. Right now it still fails because it tries to copy a VGPR to
   an SGPR, which is not possible.
  
  Can you add some lit tests for these new intrinsics
 
 Done, updated patches attached.
 
 
  and also add CHECK lines for SI to the existing local-memory.ll test.
 
 Can't do that while it still fails to generate SI code. Should I commit
 the other patches anyway, which are only necessary for that test?
 


Can you add a TODO comment to that test for adding SI checks?

With that change, the patches are:

Reviewed-by: Tom Stellard thomas.stell...@amd.com
 
 -- 
 Earthling Michel Dänzer   |   http://www.amd.com
 Libre software enthusiast |  Debian, X and DRI developer

 From 3572bab6a6b5c967d19add0b0497a96123754ec2 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
 Date: Thu, 21 Feb 2013 16:12:45 +0100
 Subject: [PATCH v2 1/4] R600/SI: Add intrinsics for texture sampling with user
  derivatives
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 
 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
 
 v2: Add lit test
 
  lib/Target/R600/SIInstructions.td|   7 +-
  lib/Target/R600/SIIntrinsics.td  |   1 +
  test/CodeGen/R600/llvm.SI.sampled.ll | 140 
 +++
  3 files changed, 147 insertions(+), 1 deletion(-)
  create mode 100644 test/CodeGen/R600/llvm.SI.sampled.ll
 
 diff --git a/lib/Target/R600/SIInstructions.td 
 b/lib/Target/R600/SIInstructions.td
 index 9c96c08..c9eac7d 100644
 --- a/lib/Target/R600/SIInstructions.td
 +++ b/lib/Target/R600/SIInstructions.td
 @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, 
 IMAGE_SAMPLE_B;
  //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027;
  def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C;
  //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029;
 -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a;
 +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D;
  //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 
 0x002b;
  def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L;
  def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B;
 @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type {
def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type;
def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, 
 addr_type;
 +
 +  def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
 +  def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
 +  def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
 +  def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, 
 addr_type;
  }
  
  defm : SamplePatternsv2i32;
 diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
 index 224cd2f..d2643e0 100644
 --- a/lib/Target/R600/SIIntrinsics.td
 +++ b/lib/Target/R600/SIIntrinsics.td
 @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in {
  
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
 +  def int_SI_sampled : Sample;
def int_SI_samplel : Sample;
  
def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, 
 llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem];
 diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll 
 b/test/CodeGen/R600/llvm.SI.sampled.ll
 new file mode 100644
 index 000..71b8ef5
 --- /dev/null
 +++ b/test/CodeGen/R600/llvm.SI.sampled.ll
 @@ -0,0 +1,140 @@
 +;RUN: llc  %s -march=r600 -mcpu=verde | FileCheck %s
 +
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5
 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9
 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10
 +;CHECK: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12
 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
 +;CHECK: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
 +;CHECK: IMAGE_SAMPLE_D 

Re: [Mesa-dev] R600/SI: Support for local memory and derivatives

2013-07-10 Thread Michel Dänzer
On Mit, 2013-07-10 at 08:15 -0700, Tom Stellard wrote:
 On Wed, Jul 10, 2013 at 12:32:25PM +0200, Michel Dänzer wrote:
  On Fre, 2013-06-28 at 14:37 -0700, Tom Stellard wrote:
   On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote:

These patches implement enough of local memory support to allow radeonsi
to use that for computing derivatives, as suggested by Tom.

They also almost allow test/CodeGen/R600/local-memory.ll to generate
code for SI. Right now it still fails because it tries to copy a VGPR to
an SGPR, which is not possible.
   
   Can you add some lit tests for these new intrinsics
  
  Done, updated patches attached.
  
  
   and also add CHECK lines for SI to the existing local-memory.ll test.
  
  Can't do that while it still fails to generate SI code. Should I commit
  the other patches anyway, which are only necessary for that test?
 
 Can you add a TODO comment to that test for adding SI checks?
 
 With that change, the patches are:
 
 Reviewed-by: Tom Stellard thomas.stell...@amd.com

Thanks, I managed to enable basic lit testing after all, see the
attached patches 4 and 5.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
From 0f11058228a2c6504ed78f9856e6de3f8af0c0e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Wed, 19 Jun 2013 11:01:00 +0200
Subject: [PATCH 4/5] R600/SI: Add pattern for the AMDGPU.barrier.local
 intrinsic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

lit test coverage to follow in the next commit.

Reviewed-by: Tom Stellard thomas.stell...@amd.com
Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 lib/Target/R600/SIInstructions.td | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 61755b4..30f2a4a 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -774,8 +774,17 @@ def S_CBRANCH_EXECNZ : SOPP 
 } // End isBranch = 1
 } // End isTerminator = 1
 
-//def S_BARRIER : SOPP_ 0x000a, S_BARRIER, [];
 let hasSideEffects = 1 in {
+def S_BARRIER : SOPP 0x000a, (ins), S_BARRIER,
+  [(int_AMDGPU_barrier_local)]
+ {
+  let SIMM16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
 def S_WAITCNT : SOPP 0x000c, (ins i32imm:$simm16), S_WAITCNT $simm16,
   []
 ;
-- 
1.8.3.2

From 09715a4574c2e35b02176516f542bc0d1d0dc132 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Mon, 17 Jun 2013 12:21:29 +0200
Subject: [PATCH v2 5/5] R600/SI: Initial local memory support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enough for the radeonsi driver to use it for calculating derivatives.

Reviewed-by: Tom Stellard thomas.stell...@amd.com
Signed-off-by: Michel Dänzer michel.daen...@amd.com
---

v2: Enable some lit testing of local memory on SI.

 lib/Target/R600/AMDGPUAsmPrinter.cpp  |  7 +++
 lib/Target/R600/AMDGPUISelLowering.cpp|  4 +-
 lib/Target/R600/R600ISelLowering.cpp  |  2 +
 lib/Target/R600/SIDefines.h   |  4 ++
 lib/Target/R600/SIISelLowering.cpp|  5 ++
 lib/Target/R600/SIInstructions.td | 15 ++
 test/CodeGen/R600/local-memory-two-objects.ll | 51 
 test/CodeGen/R600/local-memory.ll | 67 ++-
 8 files changed, 100 insertions(+), 55 deletions(-)
 create mode 100644 test/CodeGen/R600/local-memory-two-objects.ll

diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 996d2a6..e039b77 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -233,7 +233,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction MF) {
 
   OutStreamer.EmitIntValue(RsrcReg, 4);
   OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
+
+  if (MFI-ShaderType == ShaderType::COMPUTE) {
+OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
+OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(RoundUpToAlignment(MFI-LDSSize, 256)  8), 4);
+  }
   if (MFI-ShaderType == ShaderType::PIXEL) {
+OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(RoundUpToAlignment(MFI-LDSSize, 256)  8), 4);
 OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
 OutStreamer.EmitIntValue(MFI-PSInputAddr, 4);
   }
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4019a1f..7fad3bb 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -72,8 +72,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) :
   

Re: [Mesa-dev] R600/SI: Initial double precision support for Radeon SI

2013-07-09 Thread Niels Ole Salscheider
Hi Tom,

 All these patches look good to me, but #2 and #6 should have a test case
 with them.  If you resubmit these patches with test cases, I will push the
 entire series.

I have attached an updated patchset. I have added a test case to patch #2 and 
#6. I have also replaced the scalar move in patch #2 by a vector move since 
there is probably no point in having a floating point value in a scalar 
register.

Kind regards,

Ole
From 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001
From: Niels Ole Salscheider niels_...@salscheider-online.de
Date: Sat, 1 Jun 2013 16:48:56 +0200
Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI

---
 lib/Target/R600/AMDGPUISelLowering.cpp |  6 ++
 lib/Target/R600/SIISelLowering.cpp |  1 +
 lib/Target/R600/SIInstructions.td  | 30 +-
 test/CodeGen/R600/fadd64.ll| 13 +
 test/CodeGen/R600/fdiv64.ll| 14 ++
 test/CodeGen/R600/fmul64.ll| 13 +
 test/CodeGen/R600/load64.ll| 20 
 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
 create mode 100644 test/CodeGen/R600/fadd64.ll
 create mode 100644 test/CodeGen/R600/fdiv64.ll
 create mode 100644 test/CodeGen/R600/fmul64.ll
 create mode 100644 test/CodeGen/R600/load64.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4019a1f..5f3d496 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) :
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 9d4cfef..0d17a12 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
 
   addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass);
   addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass);
 
   addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass);
   addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9c96c08..b956387 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 
   [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
 ;
 defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, [];
-defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, [];
+defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64,
+  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+;
 defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, [];
 defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, [];
 defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, [];
@@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, V_LSHR_B64,
 ;
 def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, [];
 
+let isCommutable = 1 in {
+
 def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, [];
 def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, [];
 def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, [];
 def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, [];
+
+} // isCommutable = 1
+
+def : Pat 
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+;
+
+def : Pat  
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+;
+
 def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, [];
 
 let isCommutable = 1 in {
@@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32;
 def : BitConvert f32, i32, SReg_32;
 def : BitConvert f32, i32, VReg_32;
 
+def : BitConvert i64, f64, VReg_64;
+
+def : BitConvert f64, i64, VReg_64;
+
 /** === **/
 /** Src  Dst modifiers **/
 /** === **/
@@ -1505,6 +1526,11 @@ def : Pat
   (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
 ;
 
+def : Pat
+  (fdiv f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+;
+
 def : Pat 
   (fcos f32:$src0),
   (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
@@ -1634,6 +1660,8 @@ multiclass MUBUFLoad_Pattern MUBUF Instr_ADDR64, ValueType vt,
   ;
 }
 
+defm : MUBUFLoad_Pattern 

Re: [Mesa-dev] R600/SI: Initial double precision support for Radeon SI

2013-07-08 Thread Tom Stellard
On Tue, Jul 02, 2013 at 10:44:10AM +0200, Niels Ole Salscheider wrote:
 Hi,
 
 the attached patches add initial support for double precision operations on 
 Southern Islands cards.
 
 Some expressions containing multiple double precision kernel arguments cause 
 llvm to run until all memory is used - but I do not (yet) know why.
 It works fine as long as I pass pointers to double values.
 

I may have an idea about why this is happening.  Could you file a bug
report and attach an LLVM IR test case?

All these patches look good to me, but #2 and #6 should have a test case
with them.  If you resubmit these patches with test cases, I will push the
entire series.

Nice work!

-Tom


 Regards,
 
 Ole

 From 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001
 From: Niels Ole Salscheider niels_...@salscheider-online.de
 Date: Sat, 1 Jun 2013 16:48:56 +0200
 Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI
 
 ---
  lib/Target/R600/AMDGPUISelLowering.cpp |  6 ++
  lib/Target/R600/SIISelLowering.cpp |  1 +
  lib/Target/R600/SIInstructions.td  | 30 +-
  test/CodeGen/R600/fadd64.ll| 13 +
  test/CodeGen/R600/fdiv64.ll| 14 ++
  test/CodeGen/R600/fmul64.ll| 13 +
  test/CodeGen/R600/load64.ll| 20 
  7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
  create mode 100644 test/CodeGen/R600/fadd64.ll
  create mode 100644 test/CodeGen/R600/fdiv64.ll
  create mode 100644 test/CodeGen/R600/fmul64.ll
  create mode 100644 test/CodeGen/R600/load64.ll
 
 diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
 b/lib/Target/R600/AMDGPUISelLowering.cpp
 index 4019a1f..5f3d496 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.cpp
 +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
 @@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine 
 TM) :
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
  
 +  setOperationAction(ISD::STORE, MVT::f64, Promote);
 +  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 +
setOperationAction(ISD::LOAD, MVT::f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
  
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  
 +  setOperationAction(ISD::LOAD, MVT::f64, Promote);
 +  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 +
setOperationAction(ISD::MUL, MVT::i64, Expand);
  
setOperationAction(ISD::UDIV, MVT::i32, Expand);
 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index 9d4cfef..0d17a12 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
  
addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass);
 +  addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass);
  
addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass);
 diff --git a/lib/Target/R600/SIInstructions.td 
 b/lib/Target/R600/SIInstructions.td
 index 9c96c08..b956387 100644
 --- a/lib/Target/R600/SIInstructions.td
 +++ b/lib/Target/R600/SIInstructions.td
 @@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 
[(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
  ;
  defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, [];
 -defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, [];
 +defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64,
 +  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
 +;
  defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, [];
  defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, [];
  defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, [];
 @@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, 
 V_LSHR_B64,
  ;
  def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, [];
  
 +let isCommutable = 1 in {
 +
  def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, [];
  def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, [];
  def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, [];
  def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, [];
 +
 +} // isCommutable = 1
 +
 +def : Pat 
 +  (fadd f64:$src0, f64:$src1),
 +  (V_ADD_F64 $src0, $src1, (i64 0))
 +;
 +
 +def : Pat  
 +  (fmul f64:$src0, f64:$src1),
 +  (V_MUL_F64 $src0, $src1, (i64 0))
 +;
 +
  def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, [];
  
  let isCommutable = 1 in {
 @@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32;
  def : BitConvert f32, i32, SReg_32;
  def : BitConvert f32, i32, VReg_32;
  
 +def : BitConvert i64, f64, VReg_64;
 +
 +def : BitConvert f64, i64, VReg_64;
 +
  /** === **/
  /** Src  Dst modifiers **/
  /** === **/
 @@ -1505,6 +1526,11 @@ def : Pat
 

[Mesa-dev] R600/SI: Initial double precision support for Radeon SI

2013-07-02 Thread Niels Ole Salscheider
Hi,

the attached patches add initial support for double precision operations on 
Southern Islands cards.

Some expressions containing multiple double precision kernel arguments cause 
llvm to run until all memory is used - but I do not (yet) know why.
It works fine as long as I pass pointers to double values.

Regards,

OleFrom 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001
From: Niels Ole Salscheider niels_...@salscheider-online.de
Date: Sat, 1 Jun 2013 16:48:56 +0200
Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI

---
 lib/Target/R600/AMDGPUISelLowering.cpp |  6 ++
 lib/Target/R600/SIISelLowering.cpp |  1 +
 lib/Target/R600/SIInstructions.td  | 30 +-
 test/CodeGen/R600/fadd64.ll| 13 +
 test/CodeGen/R600/fdiv64.ll| 14 ++
 test/CodeGen/R600/fmul64.ll| 13 +
 test/CodeGen/R600/load64.ll| 20 
 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
 create mode 100644 test/CodeGen/R600/fadd64.ll
 create mode 100644 test/CodeGen/R600/fdiv64.ll
 create mode 100644 test/CodeGen/R600/fmul64.ll
 create mode 100644 test/CodeGen/R600/load64.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4019a1f..5f3d496 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) :
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 9d4cfef..0d17a12 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
 
   addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass);
   addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass);
 
   addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass);
   addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9c96c08..b956387 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 
   [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
 ;
 defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, [];
-defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, [];
+defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64,
+  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+;
 defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, [];
 defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, [];
 defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, [];
@@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, V_LSHR_B64,
 ;
 def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, [];
 
+let isCommutable = 1 in {
+
 def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, [];
 def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, [];
 def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, [];
 def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, [];
+
+} // isCommutable = 1
+
+def : Pat 
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+;
+
+def : Pat  
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+;
+
 def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, [];
 
 let isCommutable = 1 in {
@@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32;
 def : BitConvert f32, i32, SReg_32;
 def : BitConvert f32, i32, VReg_32;
 
+def : BitConvert i64, f64, VReg_64;
+
+def : BitConvert f64, i64, VReg_64;
+
 /** === **/
 /** Src  Dst modifiers **/
 /** === **/
@@ -1505,6 +1526,11 @@ def : Pat
   (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
 ;
 
+def : Pat
+  (fdiv f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+;
+
 def : Pat 
   (fcos f32:$src0),
   (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
@@ -1634,6 +1660,8 @@ multiclass MUBUFLoad_Pattern MUBUF Instr_ADDR64, ValueType vt,
   ;
 }
 
+defm : MUBUFLoad_Pattern BUFFER_LOAD_DWORDX2_ADDR64, i64,
+  global_load, constant_load;
 defm : MUBUFLoad_Pattern 

Re: [Mesa-dev] R600/SI: Support for local memory and derivatives

2013-06-28 Thread Tom Stellard
On Wed, Jun 19, 2013 at 06:28:21PM +0200, Michel Dänzer wrote:
 
 These patches implement enough of local memory support to allow radeonsi
 to use that for computing derivatives, as suggested by Tom.
 
 They also almost allow test/CodeGen/R600/local-memory.ll to generate
 code for SI. Right now it still fails because it tries to copy a VGPR to
 an SGPR, which is not possible.
 


Can you add some lit tests for these new intrinsics and also add CHECK
lines for SI to the existing local-memory.ll test?

With the tests added, these patches are:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

 -- 
 Earthling Michel Dänzer   |   http://www.amd.com
 Libre software enthusiast |  Debian, X and DRI developer

 From f4ca359c4536aa53122b654196f2e007d50976f8 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
 Date: Thu, 21 Feb 2013 16:12:45 +0100
 Subject: [PATCH 1/6] R600/SI: Add intrinsics for texture sampling with user
  derivatives
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 
 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
  lib/Target/R600/SIInstructions.td | 7 ++-
  lib/Target/R600/SIIntrinsics.td   | 1 +
  2 files changed, 7 insertions(+), 1 deletion(-)
 
 diff --git a/lib/Target/R600/SIInstructions.td 
 b/lib/Target/R600/SIInstructions.td
 index 9c96c08..c9eac7d 100644
 --- a/lib/Target/R600/SIInstructions.td
 +++ b/lib/Target/R600/SIInstructions.td
 @@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, 
 IMAGE_SAMPLE_B;
  //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027;
  def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C;
  //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029;
 -//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a;
 +def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D;
  //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 
 0x002b;
  def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L;
  def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B;
 @@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type {
def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type;
def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, 
 addr_type;
 +
 +  def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
 +  def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
 +  def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
 +  def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, 
 addr_type;
  }
  
  defm : SamplePatternsv2i32;
 diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
 index 224cd2f..d2643e0 100644
 --- a/lib/Target/R600/SIIntrinsics.td
 +++ b/lib/Target/R600/SIIntrinsics.td
 @@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in {
  
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
 +  def int_SI_sampled : Sample;
def int_SI_samplel : Sample;
  
def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, 
 llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem];
 -- 
 1.8.3.1
 

 From 7a0048bb2ab1b661831da2b764bf1a52f66bec15 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
 Date: Thu, 21 Feb 2013 18:51:38 +0100
 Subject: [PATCH v3 2/6] R600/SI: Initial support for LDS/GDS instructions
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 
 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
 
 v3: Drop vdst operand from DS_Store_Helper class, and adapt
 SIInsertWaits::getHwCounts() to handle that. Unfortunately, this seems
 to mess up the asm string output somehow, not sure what's going on
 there.
 
  lib/Target/R600/SIInsertWaits.cpp  |  2 ++
  lib/Target/R600/SIInstrFormats.td  | 24 
  lib/Target/R600/SIInstrInfo.td | 23 +++
  lib/Target/R600/SIInstructions.td  |  3 +++
  lib/Target/R600/SILowerControlFlow.cpp | 16 
  5 files changed, 68 insertions(+)
 
 diff --git a/lib/Target/R600/SIInsertWaits.cpp 
 b/lib/Target/R600/SIInsertWaits.cpp
 index c36e1dc..d31da45 100644
 --- a/lib/Target/R600/SIInsertWaits.cpp
 +++ b/lib/Target/R600/SIInsertWaits.cpp
 @@ -134,6 +134,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr MI) {
if (TSFlags  SIInstrFlags::LGKM_CNT) {
  
  MachineOperand Op = MI.getOperand(0);
 +if (!Op.isReg())
 +  Op = MI.getOperand(1);
  assert(Op.isReg()  First LGKM operand must be a register!);
  
  unsigned Reg = Op.getReg();
 diff --git a/lib/Target/R600/SIInstrFormats.td 
 b/lib/Target/R600/SIInstrFormats.td
 index 51f323d..434aa7e 100644
 --- 

Re: [Mesa-dev] R600 Patches: KCache kernel arguments and 24-bit arithmetic

2013-06-27 Thread Vincent Lejeune
The whole series is: Reviewed-by: Vincent Lejeune vljn at ovi.com

In a future patch we might also remove the ISD::BUILD_VECTOR case in the 
Select() function and use
a tablegen pattern instead. I wrote that case because we used to lower the
r600.load.input intrinsic to a raw register; however, we
now lower it to a copy from a register, which should be convertible to a 
REG_SEQUENCE.

Vincent




- Mail original -
 De : Tom Stellard t...@stellard.net
 À : llvm-comm...@cs.uiuc.edu
 Cc : mesa-dev@lists.freedesktop.org
 Envoyé le : Mardi 25 juin 2013 23h37
 Objet : [Mesa-dev] R600 Patches: KCache kernel arguments and 24-bit
arithmetic
 
 Hi,
 
 The attached patches clean up kernel argument handling for both R600 and
 SI and for R600 makes it possible to read arguments through the KCache.
 There are also patches that add support for the 24-bit arithmetic instructions
 (MAD_UINT24, MAD_INT24, MUL_UINT24, and MUL_INT24).  In order to test
 these patches you will also need to apply the corresponding Mesa
 patches, which will be on the mailing list soon.
 
 -Tom
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patches: Add support for the local address space

2013-06-26 Thread Tom Stellard
Hi Vincent,

Here is an updated version of patch #3.

-Tom

On Fri, Jun 14, 2013 at 08:35:03AM -0700, Vincent Lejeune wrote:
 Hi,
 
 Thanks for your work on this!
 Patches 2, 4 and 5 have my Reviewed-by.
 
 
 diff --git a/lib/Target/R600/R600InstrInfo.cpp 
 b/lib/Target/R600/R600InstrInfo.cpp
 index b9da74c..6de47f7 100644
 --- a/lib/Target/R600/R600InstrInfo.cpp
 +++ b/lib/Target/R600/R600InstrInfo.cpp
 @@ -133,6 +133,12 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
  bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
unsigned TargetFlags = get(Opcode).TSFlags; +  return (TargetFlags  
  R600_InstFlag::ALU_INST);
 +}
 +
 +bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
 +  unsigned TargetFlags = get(Opcode).TSFlags;
 +
return ((TargetFlags  R600_InstFlag::OP1) |
(TargetFlags  R600_InstFlag::OP2) |
(TargetFlags  R600_InstFlag::OP3));
 Function prototype is not defined here (it is defined in patch 5).
 
 
 
 diff --git a/lib/Target/R600/R600MachineScheduler.cpp 
 b/lib/Target/R600/R600MachineScheduler.cpp
 index a330d88..acc1b4d 100644
 --- a/lib/Target/R600/R600MachineScheduler.cpp
 +++ b/lib/Target/R600/R600MachineScheduler.cpp
 @@ -269,10 +269,14 @@ R600SchedStrategy::AluKind 
 R600SchedStrategy::getAluKind(SUnit *SU) const {
  }
  
  // Does the instruction take a whole IG ?
 +// XXX: Is it possible to add a helper function in R600InstrInfo that 
 can
 +// be used here and in R600PacketizerList::isSoloInstruction() ?
  if(TII-isVector(*MI) ||
  TII-isCubeOp(MI-getOpcode()) ||
 -TII-isReductionOp(MI-getOpcode()))
 +TII-isReductionOp(MI-getOpcode()) ||
 +MI-getOpcode() == AMDGPU::GROUP_BARRIER) {
return AluT_XYZW;
 +}
 
 I'm not sure it would factor out that much code; R600Packetizer is called after 
 cube/reduction ops are lowered
 by the R600Expand pass, and thus the isVector/ReductionOp check is useless there. I may 
 have left some debug code in
 the isSoloInstruction code, though.
 
 
 
 - Mail original -
  De : Tom Stellard t...@stellard.net
  À : llvm-comm...@cs.uiuc.edu
  Cc : mesa-dev@lists.freedesktop.org
  Envoyé le : Jeudi 13 juin 2013 2h42
  Objet : [Mesa-dev] R600 Patches: Add support for the local address space
  
  Hi,
  
  The attached patches add support for local address space on
  Evergreen / Northern Islands GPUs.
  
  Please Review.
  
  -Tom
  
  ___
  mesa-dev mailing list
  mesa-dev@lists.freedesktop.org
  http://lists.freedesktop.org/mailman/listinfo/mesa-dev
 
From e5c9de74bcd7625b954aa3f070e4cb9a4b920c85 Mon Sep 17 00:00:00 2001
From: Tom Stellard thomas.stell...@amd.com
Date: Wed, 12 Jun 2013 09:02:39 -0700
Subject: [PATCH] R600: Add ALUInst bit to tablegen definitions v2

v2:
  - Remove functions left over from a previous rebase.
---
 lib/Target/R600/R600Defines.h   | 3 ++-
 lib/Target/R600/R600InstrFormats.td | 2 ++
 lib/Target/R600/R600InstrInfo.cpp   | 4 +---
 lib/Target/R600/R600Instructions.td | 3 +++
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index e30ea27..6bcf8ae 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -41,7 +41,8 @@ namespace R600_InstFlag {
 OP1 = (1  10),
 OP2 = (1  11),
 VTX_INST  = (1  12),
-TEX_INST = (1  13)
+TEX_INST = (1  13),
+ALU_INST = (1  14)
   };
 }
 
diff --git a/lib/Target/R600/R600InstrFormats.td 
b/lib/Target/R600/R600InstrFormats.td
index d31f18c..2c98fb9 100644
--- a/lib/Target/R600/R600InstrFormats.td
+++ b/lib/Target/R600/R600InstrFormats.td
@@ -26,6 +26,7 @@ class InstR600 dag outs, dag ins, string asm, listdag 
pattern,
   bit HasNativeOperands = 0;
   bit VTXInst = 0;
   bit TEXInst = 0;
+  bit ALUInst = 0;
 
   let Namespace = AMDGPU;
   let OutOperandList = outs;
@@ -47,6 +48,7 @@ class InstR600 dag outs, dag ins, string asm, listdag 
pattern,
   let TSFlags{11} = Op2;
   let TSFlags{12} = VTXInst;
   let TSFlags{13} = TEXInst;
+  let TSFlags{14} = ALUInst;
 }
 
 
//===--===//
diff --git a/lib/Target/R600/R600InstrInfo.cpp 
b/lib/Target/R600/R600InstrInfo.cpp
index d17425f..f267ee9 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -133,9 +133,7 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
 bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
   unsigned TargetFlags = get(Opcode).TSFlags;
 
-  return ((TargetFlags  R600_InstFlag::OP1) |
-  (TargetFlags  R600_InstFlag::OP2) |
-  (TargetFlags  R600_InstFlag::OP3));
+  return (TargetFlags  R600_InstFlag::ALU_INST);
 }
 
 bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
diff --git a/lib/Target/R600/R600Instructions.td 
b/lib/Target/R600/R600Instructions.td
index d819d44..b0a82ff 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600

Re: [Mesa-dev] R600: Expand integer operations for SI and consolidate code with EG

2013-06-21 Thread Tom Stellard
On Thu, Jun 20, 2013 at 06:43:38PM -0500, Aaron Watry wrote:
 This series is intended to bring SI closer to evergreen when it comes to
 support for v2i32/v4i32 integer operations.
 
 It adds support for expanding the following v2i32/v4i32 operations on SI:
 AND, MUL, OR, SHL, SRL, ASHR, UDIV, UREM, XOR
 
 Once that's done, the setOperationAction(op,type,Expand) calls that appear in
 both R600ISelLowering.cpp and SIISelLowering.cpp are all moved to
 AMDGPUISelLowering.cpp.  If we decide to implement these ops through native
 instructions for either target in the future, we can override that in the
 individual targets.
 
 Signed-off-by: Aaron Watry awa...@gmail.com

Just one small comment on the SHL patch, but with that fixed these
patches are:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

 
 R600/SI: Expand and of v2i32/v4i32 for SI
 R600/SI: Expand mul of v2i32/v4i32 for SI
 R600/SI: Expand or of v2i32/v4i32 for SI
 R600/SI: Expand shl of v2i32/v4i32 for SI
 R600/SI: Expand srl of v2i32/v4i32 for SI
 R600/SI: Expand ashr of v2i32/v4i32 for SI
 R600/SI: Expand udiv v[24]i32 for SI and v2i32 for EG
 R600/SI: Expand urem of v2i32/v4i32 for SI
 R600: Add v2i32 test for setcc on evergreen
 R600/SI: Expand xor v2i32/v4i32
 R600: Add v2i32 test for vselect
 R600: Consolidate expansion of v2i32/v4i32 ops for SI/EG
 
  lib/Target/R600/AMDGPUISelLowering.cpp | 22 
  lib/Target/R600/R600ISelLowering.cpp   | 18 -
  lib/Target/R600/SIISelLowering.cpp |  5 
  test/CodeGen/R600/and.ll   | 37 +-
  test/CodeGen/R600/mul.ll   | 38 ++-
  test/CodeGen/R600/or.ll| 41 -
  test/CodeGen/R600/setcc.ll | 25 +++---
  test/CodeGen/R600/shl.ll   | 47 
 ++
  test/CodeGen/R600/sra.ll   | 41 -
  test/CodeGen/R600/srl.ll   | 42 +-
  test/CodeGen/R600/udiv.ll  | 25 +++---
  test/CodeGen/R600/urem.ll  | 27 ---
  test/CodeGen/R600/vselect.ll   | 26 ++-
  test/CodeGen/R600/xor.ll   | 40 -
  14 files changed, 345 insertions(+), 89 deletions(-)
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600: Expand integer operations for SI and consolidate code with EG

2013-06-20 Thread Aaron Watry
This series is intended to bring SI closer to evergreen when it comes to
support for v2i32/v4i32 integer operations.

It adds support for expanding the following v2i32/v4i32 operations on SI:
AND, MUL, OR, SHL, SRL, ASHR, UDIV, UREM, XOR

Once that's done, the setOperationAction(op,type,Expand) calls that appear in
both R600ISelLowering.cpp and SIISelLowering.cpp are all moved to
AMDGPUISelLowering.cpp.  If we decide to implement these ops through native
instructions for either target in the future, we can override that in the
individual targets.

Signed-off-by: Aaron Watry awa...@gmail.com

R600/SI: Expand and of v2i32/v4i32 for SI
R600/SI: Expand mul of v2i32/v4i32 for SI
R600/SI: Expand or of v2i32/v4i32 for SI
R600/SI: Expand shl of v2i32/v4i32 for SI
R600/SI: Expand srl of v2i32/v4i32 for SI
R600/SI: Expand ashr of v2i32/v4i32 for SI
R600/SI: Expand udiv v[24]i32 for SI and v2i32 for EG
R600/SI: Expand urem of v2i32/v4i32 for SI
R600: Add v2i32 test for setcc on evergreen
R600/SI: Expand xor v2i32/v4i32
R600: Add v2i32 test for vselect
R600: Consolidate expansion of v2i32/v4i32 ops for SI/EG

 lib/Target/R600/AMDGPUISelLowering.cpp | 22 
 lib/Target/R600/R600ISelLowering.cpp   | 18 -
 lib/Target/R600/SIISelLowering.cpp |  5 
 test/CodeGen/R600/and.ll   | 37 +-
 test/CodeGen/R600/mul.ll   | 38 ++-
 test/CodeGen/R600/or.ll| 41 -
 test/CodeGen/R600/setcc.ll | 25 +++---
 test/CodeGen/R600/shl.ll   | 47 ++
 test/CodeGen/R600/sra.ll   | 41 -
 test/CodeGen/R600/srl.ll   | 42 +-
 test/CodeGen/R600/udiv.ll  | 25 +++---
 test/CodeGen/R600/urem.ll  | 27 ---
 test/CodeGen/R600/vselect.ll   | 26 ++-
 test/CodeGen/R600/xor.ll   | 40 -
 14 files changed, 345 insertions(+), 89 deletions(-)

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600/SI: Support for local memory and derivatives

2013-06-19 Thread Michel Dänzer

These patches implement enough of local memory support to allow radeonsi
to use that for computing derivatives, as suggested by Tom.

They also almost allow test/CodeGen/R600/local-memory.ll to generate
code for SI. Right now it still fails because it tries to copy a VGPR to
an SGPR, which is not possible.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
From f4ca359c4536aa53122b654196f2e007d50976f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Thu, 21 Feb 2013 16:12:45 +0100
Subject: [PATCH 1/6] R600/SI: Add intrinsics for texture sampling with user
 derivatives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 lib/Target/R600/SIInstructions.td | 7 ++-
 lib/Target/R600/SIIntrinsics.td   | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9c96c08..c9eac7d 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027;
 def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029;
-//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a;
+def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b;
 def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L;
 def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B;
@@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type {
   def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type;
   def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
   def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
+
+  def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
+  def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
+  def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
+  def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
 }
 
 defm : SamplePatternsv2i32;
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 224cd2f..d2643e0 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in {
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
   def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem];
-- 
1.8.3.1

From 7a0048bb2ab1b661831da2b764bf1a52f66bec15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Thu, 21 Feb 2013 18:51:38 +0100
Subject: [PATCH v3 2/6] R600/SI: Initial support for LDS/GDS instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---

v3: Drop vdst operand from DS_Store_Helper class, and adapt
SIInsertWaits::getHwCounts() to handle that. Unfortunately, this seems
to mess up the asm string output somehow, not sure what's going on
there.

 lib/Target/R600/SIInsertWaits.cpp  |  2 ++
 lib/Target/R600/SIInstrFormats.td  | 24 
 lib/Target/R600/SIInstrInfo.td | 23 +++
 lib/Target/R600/SIInstructions.td  |  3 +++
 lib/Target/R600/SILowerControlFlow.cpp | 16 
 5 files changed, 68 insertions(+)

diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index c36e1dc..d31da45 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -134,6 +134,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr MI) {
   if (TSFlags  SIInstrFlags::LGKM_CNT) {
 
 MachineOperand Op = MI.getOperand(0);
+if (!Op.isReg())
+  Op = MI.getOperand(1);
 assert(Op.isReg()  First LGKM operand must be a register!);
 
 unsigned Reg = Op.getReg();
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 51f323d..434aa7e 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -281,6 +281,30 @@ class VINTRP bits 2 op, dag outs, dag ins, string asm, listdag pattern :
 
 let Uses = [EXEC] in {
 
+class DS bits8 op, dag outs, dag ins, string asm, listdag pattern :
+Enc64 outs, ins, asm, pattern {
+
+  bits8 vdst;
+  bits1 gds;
+  bits8 addr;
+  bits8 data0;
+  bits8 data1;
+  bits8 offset0;
+  bits8 offset1;
+
+  let 

[Mesa-dev] R600: Bugfixes

2013-06-17 Thread Vincent Lejeune
Hi,

these patches fix 2 bugs in the R600 backend.
The first one uses the correct rv710/rv730 encoding for TEX clauses with more 
than 8 instructions.
This bug was spotted here:

https://bugs.freedesktop.org/show_bug.cgi?id=64257
The other patch fixes a typo that causes instructions not to use the PV/PS registers 
when R600Packetizer evaluates read port limitations.
The typo prevents some bundling opportunities in some (not so frequent) situations.

Vincent


0001-R600-Properly-set-COUNT_3-bit-in-TEX-clause-initiati.patch
Description: Binary data


0002-R600-PV-stores-Reg-id-not-index.patch
Description: Binary data
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600: Bugfixes

2013-06-17 Thread Alex Deucher
On Mon, Jun 17, 2013 at 9:43 AM, Vincent Lejeune v...@ovi.com wrote:
 Hi,

 these patches fix 2 bugs in R600 backend.
 The first one use the rv710/rv730 correct encoding for TEX clause with more 
 than 8 instructions.
 This bug has been spoted there :

 https://bugs.freedesktop.org/show_bug.cgi?id=64257
 The other patch fix a typo that causes instructions not to use PV/PS register 
 when R600Packetizers evaluates read port limitations.
 It prevents some bundling opportunities in some (not so frequent) situation.

Reviewed-by: Alex Deucher alexander.deuc...@amd.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600: Bugfixes

2013-06-17 Thread Tom Stellard
On Mon, Jun 17, 2013 at 06:43:09AM -0700, Vincent Lejeune wrote:
 Hi,
 
 these patches fix 2 bugs in R600 backend.
 The first one use the rv710/rv730 correct encoding for TEX clause with more 
 than 8 instructions.
 This bug has been spoted there : 
 
 https://bugs.freedesktop.org/show_bug.cgi?id=64257
 The other patch fix a typo that causes instructions not to use PV/PS register 
 when R600Packetizers evaluates read port limitations.
 It prevents some bundling opportunities in some (not so frequent) situation.
 
 Vincent

Both patches are Reviewed-by: Tom Stellard thomas.stell...@amd.com
Can you add the bugzilla link to the commit message of patch #1?

-Tom

 ___
 llvm-commits mailing list
 llvm-comm...@cs.uiuc.edu
 http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600: Various fixes for R600 and SI

2013-06-17 Thread Aaron Watry
First patch fixes load/store for v2i32 on R600. Without this, the
other two will cause make check failures.  I've verified the changes
using a Radeon 5400 (Cedar).  Note that the previous custom
lowering of v2i32 store was causing silent data corruption.

The other two patches expand add/sub on SI for both v2i32 and v4i32
types. Lit tests for v2i32 have been added.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patches: Add support for the local address space

2013-06-14 Thread Vincent Lejeune
Hi,

Thanks for your work on this!
Patches 2, 4 and 5 have my r-b.


diff --git a/lib/Target/R600/R600InstrInfo.cpp 
b/lib/Target/R600/R600InstrInfo.cpp
index b9da74c..6de47f7 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -133,6 +133,12 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
 bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
   unsigned TargetFlags = get(Opcode).TSFlags; +  return (TargetFlags  
 R600_InstFlag::ALU_INST);
+}
+
+bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
   return ((TargetFlags & R600_InstFlag::OP1) |
   (TargetFlags & R600_InstFlag::OP2) |
   (TargetFlags & R600_InstFlag::OP3));
The function prototype is not declared in this patch (it is declared in patch 5).



diff --git a/lib/Target/R600/R600MachineScheduler.cpp 
b/lib/Target/R600/R600MachineScheduler.cpp
index a330d88..acc1b4d 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -269,10 +269,14 @@ R600SchedStrategy::AluKind 
R600SchedStrategy::getAluKind(SUnit *SU) const {
 }
 
 // Does the instruction take a whole IG ?
+// XXX: Is it possible to add a helper function in R600InstrInfo that can
+// be used here and in R600PacketizerList::isSoloInstruction() ?
 if(TII->isVector(*MI) ||
 TII->isCubeOp(MI->getOpcode()) ||
-TII->isReductionOp(MI->getOpcode()))
+TII->isReductionOp(MI->getOpcode()) ||
+MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
   return AluT_XYZW;
+}

I'm not sure it would factor out that much code; R600Packetizer runs after
cube/reduction ops are lowered by the R600Expand pass, so the
isVector/isReductionOp checks are useless there. I may have left some debug
code in isSoloInstruction, though.



- Mail original -
 De : Tom Stellard t...@stellard.net
 À : llvm-comm...@cs.uiuc.edu
 Cc : mesa-dev@lists.freedesktop.org
 Envoyé le : Jeudi 13 juin 2013 2h42
 Objet : [Mesa-dev] R600 Patches: Add support for the local address space
 
 Hi,
 
 The attached patches add support for local address space on
 Evergreen / Northern Islands GPUs.
 
 Please Review.
 
 -Tom
 
 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev
 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patches: Add support for the local address space

2013-06-13 Thread Tom Stellard
On Wed, Jun 12, 2013 at 06:37:39PM -0700, Matt Arsenault wrote:
 On 06/12/2013 05:42 PM, Tom Stellard wrote:
 Hi,
 
 The attached patches add support for local address space on
 Evergreen / Northern Islands GPUs.
 
 Please Review.
 
 -Tom
  +  def int_AMDGPU_barrier_local  : Intrinsic<[], [], []>;
 You probably want to mark this as IntrReadMem to try to avoid
 reordering stores around the barrier


I don't think the intrinsic as defined will have stores reordered around
it.  From include/llvm/IR/Intrinsics.td:

// Intr*Mem - Memory properties.  An intrinsic is allowed to have at most one of
// these properties set.  They are listed from the most aggressive (best to use
// if correct) to the least aggressive.  If no property is set, the worst case
// is assumed (it may read and write any memory it can get access to and
// it may have other side effects).

-Tom
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patches: Add support for the local address space

2013-06-12 Thread Matt Arsenault

On 06/12/2013 05:42 PM, Tom Stellard wrote:

Hi,

The attached patches add support for local address space on
Evergreen / Northern Islands GPUs.

Please Review.

-Tom

 +  def int_AMDGPU_barrier_local  : Intrinsic<[], [], []>;
You probably want to mark this as IntrReadMem to try to avoid reordering 
stores around the barrier


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/SI: Intrinsics for derivatives

2013-06-10 Thread Michel Dänzer
On Sam, 2013-06-08 at 20:08 -0400, Tom Stellard wrote:
 On Fri, Jun 07, 2013 at 05:48:05PM -0700, Tom Stellard wrote:
  On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote:
   
   @@ -1544,6 +1562,26 @@ def : Pat 
   sub3)
;

   +class DDXY Intrinsic name, bits4 ldsdelta : Pat 
   +  (name v4f32:$src, imm, imm, imm),
   +  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 
   (IMPLICIT_DEF)),
   +(SI_DD (EXTRACT_SUBREG $src, sub0), (V_LSHLREV_B32_e32 2, (SI_TID)),
   +   (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
   +   ldsdelta), sub0),
   +(SI_DD (EXTRACT_SUBREG $src, sub1), (V_LSHLREV_B32_e32 2, (SI_TID)),
   +   (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
   +   ldsdelta), sub1),
   +(SI_DD (EXTRACT_SUBREG $src, sub2), (V_LSHLREV_B32_e32 2, (SI_TID)),
   +   (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
   +   ldsdelta), sub2),
   +(SI_DD (EXTRACT_SUBREG $src, sub3), (V_LSHLREV_B32_e32 2, (SI_TID)),
   +   (V_AND_B32_e32 0xfff0, (V_LSHLREV_B32_e32 2, (SI_TID))),
   +   ldsdelta), sub3)
   +;
  
  Based on this pattern, I don't think you need to use a ddx/ddy intrinsic
  here.  All of the instructions you are lowering DDX/DDY to have an
  equivalent LLVM IR instruction or LLVM intrinsic.
  
  For the DS_READ and DS_WRITE instructions all you need to do is emit
  load/stores to the local address space and then add patterns for those
  in the backend.  As an added bonus this will add support for OpenCL
  local address spaces. I think the rest of the instructions are pretty 
  straight
  forward (unless I've overlooked something).  Let me know if you have any
  questions.
 
 I did overlook something.  You will need to add an intrinsic for thread
 id in order to implement ddx/ddy completely in LLVM IR, but I still
 think it is the best way.

Shoot, I was just happy I finally got all the piglit tests passing. :)
But I agree your suggested approach would be better, I'll give it a go.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/SI: Intrinsics for derivatives

2013-06-08 Thread Tom Stellard
On Fri, Jun 07, 2013 at 05:48:05PM -0700, Tom Stellard wrote:
 On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote:
  
  The most important difference to the previous version of these is that
  whole quad mode is now enabled and M0 initialized appropriately for the
  LDS instructions, which now allows all of the relevant piglit tests to
  pass.
 
 
 Hi Michel,
 
 After I gave this series my r-b, I was reviewing your Mesa patches, and
 I suddenly had an idea for a better way to implement this.  See my
 comments below:
 
  From bb5adcd52cc5cadc308e85f635675199f5c02f35 Mon Sep 17 00:00:00 2001
  From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
  Date: Thu, 21 Feb 2013 17:56:22 +0100
  Subject: [PATCH 3/3] R600/SI: Support AMDGPU.ddx/y intrinsics
  MIME-Version: 1.0
  Content-Type: text/plain; charset=UTF-8
  Content-Transfer-Encoding: 8bit
  
  Use LDS for calculating the deltas between neighbouring pixels.
  
  Signed-off-by: Michel Dänzer michel.daen...@amd.com
  ---
   lib/Target/R600/SIISelLowering.cpp | 77 
  +-
   lib/Target/R600/SIISelLowering.h   |  6 +++
   lib/Target/R600/SIInstructions.td  | 42 -
   3 files changed, 121 insertions(+), 4 deletions(-)
  
  diff --git a/lib/Target/R600/SIISelLowering.cpp 
  b/lib/Target/R600/SIISelLowering.cpp
  index ac6a4c3..7ea226a 100644
  --- a/lib/Target/R600/SIISelLowering.cpp
  +++ b/lib/Target/R600/SIISelLowering.cpp
  @@ -249,7 +249,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   
   MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
   MachineInstr * MI, MachineBasicBlock * BB) const {
  -
  +  MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 MachineBasicBlock::iterator I = *MI;
   
 switch (MI-getOpcode()) {
  @@ -257,7 +257,6 @@ MachineBasicBlock * 
  SITargetLowering::EmitInstrWithCustomInserter(
   return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 case AMDGPU::BRANCH: return BB;
 case AMDGPU::SI_ADDR64_RSRC: {
  -MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
   unsigned SuperReg = MI-getOperand(0).getReg();
   unsigned SubRegLo = 
  MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
   unsigned SubRegHi = 
  MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
  @@ -282,10 +281,84 @@ MachineBasicBlock * 
  SITargetLowering::EmitInstrWithCustomInserter(
   MI-eraseFromParent();
   break;
 }
  +  case AMDGPU::SI_DD:
  +LowerSI_DD(MI, *BB, I, MRI);
  +break;
  +  case AMDGPU::SI_TID:
  +LowerSI_TID(MI, *BB, I, MRI);
  +break;
 }
 return BB;
   }
   
  +void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock BB,
  +MachineBasicBlock::iterator I, MachineRegisterInfo  MRI) const {
  +  unsigned coord0 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass);
  +  unsigned coord1 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass);
  +  MachineOperand dst = MI-getOperand(0);
  +  MachineOperand coord = MI-getOperand(1);
  +  MachineOperand ldsaddr = MI-getOperand(2);
  +  MachineOperand ldsaddr0 = MI-getOperand(3);
  +  MachineOperand ldsdelta = MI-getOperand(4);
  +
  +  // Write this thread's coordinate to LDS
  +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_WRITE_B32))
  +  .addOperand(coord)
  +  .addImm(0) // LDS
  +  .addOperand(ldsaddr)
  +  .addOperand(coord)
  +  .addOperand(coord)
  +  .addImm(0)
  +  .addImm(0);
  +
  +  // Read top right / bottom left thread's coordinate from LDS
  +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord0)
  +  .addImm(0) // LDS
  +  .addOperand(ldsaddr0)
  +  .addOperand(ldsaddr0)
  +  .addOperand(ldsaddr0)
  +  .addOperand(ldsdelta)
  +  .addImm(0);
  +
  +  // Read top left thread's coordinate from LDS
  +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord1)
  +  .addImm(0) // LDS
  +  .addOperand(ldsaddr0)
  +  .addOperand(ldsaddr0)
  +  .addOperand(ldsaddr0)
  +  .addImm(0)
  +  .addImm(0);
  +
  +  // Subtract top left coordinate from top right / bottom left
  +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_SUB_F32_e32))
  +  .addOperand(dst)
  +  .addReg(coord0)
  +  .addReg(coord1);
  +
  +  MI-eraseFromParent();
  +}
  +
  +void SITargetLowering::LowerSI_TID(MachineInstr *MI, MachineBasicBlock BB,
  +MachineBasicBlock::iterator I, MachineRegisterInfo  MRI) const {
  +  unsigned mbcnt_lo = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass);
  +  MachineOperand dst = MI-getOperand(0);
  +
  +  // Get this thread's ID
  +  BuildMI(BB, I, BB.findDebugLoc(I), 
  TII-get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo)
  +  .addImm(0x)
  +  .addImm(0)
  +  .addImm(0)
  +  .addImm(0)
  +  .addImm(0)
  +  

[Mesa-dev] R600/SI: Intrinsics for derivatives

2013-06-07 Thread Michel Dänzer

The most important difference to the previous version of these is that
whole quad mode is now enabled and M0 initialized appropriately for the
LDS instructions, which now allows all of the relevant piglit tests to
pass.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
From db07ab94113be5810fd6d1035b3d394ed53d27ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Thu, 21 Feb 2013 16:12:45 +0100
Subject: [PATCH 1/3] R600/SI: Add intrinsics for texture sampling with user
 derivatives
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 lib/Target/R600/SIInstructions.td | 7 ++-
 lib/Target/R600/SIIntrinsics.td   | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index b6db815..73f87ca 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -535,7 +535,7 @@ def IMAGE_SAMPLE_B : MIMG_Sampler_Helper 0x0025, IMAGE_SAMPLE_B;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ IMAGE_SAMPLE_LZ, 0x0027;
 def IMAGE_SAMPLE_C : MIMG_Sampler_Helper 0x0028, IMAGE_SAMPLE_C;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_CL, 0x0029;
-//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ IMAGE_SAMPLE_C_D, 0x002a;
+def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper 0x002a, IMAGE_SAMPLE_C_D;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ IMAGE_SAMPLE_C_D_CL, 0x002b;
 def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper 0x002c, IMAGE_SAMPLE_C_L;
 def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper 0x002d, IMAGE_SAMPLE_C_B;
@@ -1296,6 +1296,11 @@ multiclass SamplePatternsValueType addr_type {
   def : SampleArrayPattern int_SI_sampleb, IMAGE_SAMPLE_B, addr_type;
   def : SampleShadowPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
   def : SampleShadowArrayPattern int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type;
+
+  def : SamplePattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
+  def : SampleArrayPattern int_SI_sampled, IMAGE_SAMPLE_D, addr_type;
+  def : SampleShadowPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
+  def : SampleShadowArrayPattern int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type;
 }
 
 defm : SamplePatternsv2i32;
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 224cd2f..d2643e0 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -23,6 +23,7 @@ let TargetPrefix = SI, isTarget = 1 in {
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
   def int_SI_imageload : Intrinsic [llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem];
-- 
1.8.3

From 466936a680993dec58e1e537f3b489cd82b5176c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
Date: Thu, 21 Feb 2013 18:51:38 +0100
Subject: [PATCH 2/3] R600/SI: Initial support for LDS/GDS instructions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michel Dänzer michel.daen...@amd.com
---
 lib/Target/R600/SIInstrFormats.td  | 24 
 lib/Target/R600/SIInstrInfo.td | 23 +++
 lib/Target/R600/SIInstructions.td  |  3 +++
 lib/Target/R600/SILowerControlFlow.cpp | 16 
 4 files changed, 66 insertions(+)

diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 51f323d..434aa7e 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -281,6 +281,30 @@ class VINTRP bits 2 op, dag outs, dag ins, string asm, listdag pattern :
 
 let Uses = [EXEC] in {
 
+class DS bits8 op, dag outs, dag ins, string asm, listdag pattern :
+Enc64 outs, ins, asm, pattern {
+
+  bits8 vdst;
+  bits1 gds;
+  bits8 addr;
+  bits8 data0;
+  bits8 data1;
+  bits8 offset0;
+  bits8 offset1;
+
+  let Inst{7-0} = offset0;
+  let Inst{15-8} = offset1;
+  let Inst{17} = gds;
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x36; //encoding
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data0;
+  let Inst{55-48} = data1;
+  let Inst{63-56} = vdst;
+
+  let LGKM_CNT = 1;
+}
+
 class MUBUF bits7 op, dag outs, dag ins, string asm, listdag pattern :
 Enc64outs, ins, asm, pattern {
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 42fa95f..47a64f7 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -286,6 +286,29 @@ class VOP3_64 bits9 op, string opName, listdag pattern : VOP3 
 // Vector I/O classes
 //===--===//
 
+class DS_Load_Helper bits8 op, string asm, RegisterClass regClass : DS 
+  op,
+  (outs regClass:$vdst),
+  (ins 

Re: [Mesa-dev] R600/SI: Intrinsics for derivatives

2013-06-07 Thread Tom Stellard
On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote:
 
 The most important difference to the previous version of these is that
 whole quad mode is now enabled and M0 initialized appropriately for the
 LDS instructions, which now allows all of the relevant piglit tests to
 pass.
 


For the series:

Reviewed-by: Tom Stellard thomas.stell...@amd.com
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/SI: Intrinsics for derivatives

2013-06-07 Thread Tom Stellard
On Fri, Jun 07, 2013 at 05:24:42PM +0200, Michel Dänzer wrote:
 
 The most important difference to the previous version of these is that
 whole quad mode is now enabled and M0 initialized appropriately for the
 LDS instructions, which now allows all of the relevant piglit tests to
 pass.


Hi Michel,

After I gave this series my r-b, I was reviewing your Mesa patches, and
I suddenly had an idea for a better way to implement this.  See my
comments below:

 From bb5adcd52cc5cadc308e85f635675199f5c02f35 Mon Sep 17 00:00:00 2001
 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= michel.daen...@amd.com
 Date: Thu, 21 Feb 2013 17:56:22 +0100
 Subject: [PATCH 3/3] R600/SI: Support AMDGPU.ddx/y intrinsics
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 
 Use LDS for calculating the deltas between neighbouring pixels.
 
 Signed-off-by: Michel Dänzer michel.daen...@amd.com
 ---
  lib/Target/R600/SIISelLowering.cpp | 77 
 +-
  lib/Target/R600/SIISelLowering.h   |  6 +++
  lib/Target/R600/SIInstructions.td  | 42 -
  3 files changed, 121 insertions(+), 4 deletions(-)
 
 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index ac6a4c3..7ea226a 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -249,7 +249,7 @@ SDValue SITargetLowering::LowerFormalArguments(
  
  MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr * MI, MachineBasicBlock * BB) const {
 -
 +  MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
MachineBasicBlock::iterator I = *MI;
  
switch (MI-getOpcode()) {
 @@ -257,7 +257,6 @@ MachineBasicBlock * 
 SITargetLowering::EmitInstrWithCustomInserter(
  return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
case AMDGPU::SI_ADDR64_RSRC: {
 -MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
  unsigned SuperReg = MI-getOperand(0).getReg();
  unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
  unsigned SubRegHi = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
 @@ -282,10 +281,84 @@ MachineBasicBlock * 
 SITargetLowering::EmitInstrWithCustomInserter(
  MI-eraseFromParent();
  break;
}
 +  case AMDGPU::SI_DD:
 +LowerSI_DD(MI, *BB, I, MRI);
 +break;
 +  case AMDGPU::SI_TID:
 +LowerSI_TID(MI, *BB, I, MRI);
 +break;
}
return BB;
  }
  
 +void SITargetLowering::LowerSI_DD(MachineInstr *MI, MachineBasicBlock BB,
 +MachineBasicBlock::iterator I, MachineRegisterInfo  MRI) const {
 +  unsigned coord0 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass);
 +  unsigned coord1 = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass);
 +  MachineOperand dst = MI-getOperand(0);
 +  MachineOperand coord = MI-getOperand(1);
 +  MachineOperand ldsaddr = MI-getOperand(2);
 +  MachineOperand ldsaddr0 = MI-getOperand(3);
 +  MachineOperand ldsdelta = MI-getOperand(4);
 +
 +  // Write this thread's coordinate to LDS
 +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_WRITE_B32))
 +  .addOperand(coord)
 +  .addImm(0) // LDS
 +  .addOperand(ldsaddr)
 +  .addOperand(coord)
 +  .addOperand(coord)
 +  .addImm(0)
 +  .addImm(0);
 +
 +  // Read top right / bottom left thread's coordinate from LDS
 +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord0)
 +  .addImm(0) // LDS
 +  .addOperand(ldsaddr0)
 +  .addOperand(ldsaddr0)
 +  .addOperand(ldsaddr0)
 +  .addOperand(ldsdelta)
 +  .addImm(0);
 +
 +  // Read top left thread's coordinate from LDS
 +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::DS_READ_B32), coord1)
 +  .addImm(0) // LDS
 +  .addOperand(ldsaddr0)
 +  .addOperand(ldsaddr0)
 +  .addOperand(ldsaddr0)
 +  .addImm(0)
 +  .addImm(0);
 +
 +  // Subtract top left coordinate from top right / bottom left
 +  BuildMI(BB, I, BB.findDebugLoc(I), TII-get(AMDGPU::V_SUB_F32_e32))
 +  .addOperand(dst)
 +  .addReg(coord0)
 +  .addReg(coord1);
 +
 +  MI-eraseFromParent();
 +}
 +
 +void SITargetLowering::LowerSI_TID(MachineInstr *MI, MachineBasicBlock BB,
 +MachineBasicBlock::iterator I, MachineRegisterInfo  MRI) const {
 +  unsigned mbcnt_lo = MRI.createVirtualRegister(AMDGPU::VReg_32RegClass);
 +  MachineOperand dst = MI-getOperand(0);
 +
 +  // Get this thread's ID
 +  BuildMI(BB, I, BB.findDebugLoc(I), 
 TII-get(AMDGPU::V_MBCNT_LO_U32_B32_e64), mbcnt_lo)
 +  .addImm(0x)
 +  .addImm(0)
 +  .addImm(0)
 +  .addImm(0)
 +  .addImm(0)
 +  .addImm(0);
 +  BuildMI(BB, I, BB.findDebugLoc(I), 
 TII-get(AMDGPU::V_MBCNT_HI_U32_B32_e32))
 +  .addOperand(dst)
 +  .addImm(0x)
 +  .addReg(mbcnt_lo);
 +
 +  

Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute

2013-05-17 Thread Michel Dänzer
On Mit, 2013-05-15 at 14:26 -0700, Tom Stellard wrote:
 
 The attached patches add some new patterns and instructions for SI and
 are a prerequisite for more invasive compute shader changes that I'm
 working on.
 
 Please Review.

The SI changes are

Reviewed-by: Michel Dänzer michel.daen...@amd.com


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast |  Debian, X and DRI developer
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute

2013-05-16 Thread Vincent Lejeune
Hi,


-- next part --
From dc547a89dac5039ce521f3c27fb23346251d488d Mon Sep 17 00:00:00 2001 From: 
Tom Stellard thomas.stellard at amd.com
Date: Tue, 7 May 2013 16:26:26 -0400
Subject: [PATCH 4/7] R600: Swap the legality of rotl and rotr

The hardware supports rotr and not rotl.
---
 lib/Target/R600/AMDGPUISelLowering.cpp |  3 +++
 lib/Target/R600/AMDGPUISelLowering.h   |  1 -
 lib/Target/R600/AMDGPUInstrInfo.td |  6 --
 lib/Target/R600/AMDGPUInstructions.td  |  6 ++
 lib/Target/R600/AMDILISelLowering.cpp  |  2 --
 lib/Target/R600/R600ISelLowering.cpp   | 15 ---
 lib/Target/R600/R600Instructions.td|  6 ++
 test/CodeGen/R600/rotr.ll  | 29 +
 8 files changed, 40 insertions(+), 28 deletions(-)
 create mode 100644 test/CodeGen/R600/rotr.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..b3c51e3 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -46,6 +46,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine 
TM) :
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
 
+  // The hardware supports ROTR, but not ROTL
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+
   // Lower floating point store/load to integer store/load to reduce the 
 number
   // of patterns in tablegen.
   setOperationAction(ISD::STORE, MVT::f32, Promote);
diff --git a/lib/Target/R600/AMDGPUISelLowering.h 
b/lib/Target/R600/AMDGPUISelLowering.h
index c2a79ea..6f8ab8b 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -115,7 +115,6 @@ enum {
   RET_FLAG,
   BRANCH_COND,
   // End AMDIL ISD Opcodes
-  BITALIGN,
   BUFFER_STORE,
   DWORDADDR,
   FRACT,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td 
b/lib/Target/R600/AMDGPUInstrInfo.td
index b66ae87..a0a3410 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile1, 3, [
 // AMDGPU DAG Nodes
 //
 
-// out = ((a  32) | b)  c)
-//
-// Can be used to optimize rtol:
-// rotl(a, b) = bitalign(a, a, 32 - b)
-def AMDGPUbitalign : SDNodeAMDGPUISD::BITALIGN, AMDGPUDTIntTernaryOp;
-
 // This argument to this node is a dword address.
 def AMDGPUdwordaddr : SDNodeAMDGPUISD::DWORDADDR, SDTIntUnaryOp;
 
diff --git a/lib/Target/R600/AMDGPUInstructions.td 
b/lib/Target/R600/AMDGPUInstructions.td
index d2620b2..54df7d0 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -295,6 +295,12 @@ class BFEPattern Instruction BFE : Pat 
   (BFE $x, $y, $z)
 ;
 
+// rotr pattern
+class ROTRPattern Instruction BIT_ALIGN : Pat 
+  (rotr i32:$src0, i32:$src1),
+  (BIT_ALIGN $src0, $src0, $src1)
+;
+
 include R600Instructions.td
 
 include SIInstrInfo.td
diff --git a/lib/Target/R600/AMDILISelLowering.cpp 
b/lib/Target/R600/AMDILISelLowering.cpp
index 922cac1..e20dbe0 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -138,8 +138,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
-// GPU doesn't have a rotl, rotr, or byteswap instruction
-setOperationAction(ISD::ROTR, VT, Expand);
 setOperationAction(ISD::BSWAP, VT, Expand);
 
 // GPU doesn't have any counting operators
diff --git a/lib/Target/R600/R600ISelLowering.cpp 
b/lib/Target/R600/R600ISelLowering.cpp
index 7252235..e58a8dd 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
 
-  setOperationAction(ISD::ROTL, MVT::i32, Custom);
-
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
@@ -327,7 +325,6 @@ using namespace llvm::AMDGPUIntrinsic;
 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) 
 const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::ROTL: return LowerROTL(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
@@ -518,18 +515,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, 
SelectionDAG DAG) const
   return DAG.getConstant(Offset * 4 * TFL-getStackWidth(MF), MVT::i32);
 }
 
-SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-
-  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
- Op.getOperand(0),
-

Re: [Mesa-dev] R600/SI Patches: A few cleanups for compute

2013-05-16 Thread Tom Stellard
On Thu, May 16, 2013 at 08:21:36AM -0700, Vincent Lejeune wrote:
 Hi,
 
 
 -- next part --
 From dc547a89dac5039ce521f3c27fb23346251d488d Mon Sep 17 00:00:00 2001 
 From: Tom Stellard thomas.stellard at amd.com
 Date: Tue, 7 May 2013 16:26:26 -0400
 Subject: [PATCH 4/7] R600: Swap the legality of rotl and rotr
 
 The hardware supports rotr and not rotl.
 ---
  lib/Target/R600/AMDGPUISelLowering.cpp |  3 +++
  lib/Target/R600/AMDGPUISelLowering.h   |  1 -
  lib/Target/R600/AMDGPUInstrInfo.td |  6 --
  lib/Target/R600/AMDGPUInstructions.td  |  6 ++
  lib/Target/R600/AMDILISelLowering.cpp  |  2 --
  lib/Target/R600/R600ISelLowering.cpp   | 15 ---
  lib/Target/R600/R600Instructions.td|  6 ++
  test/CodeGen/R600/rotr.ll  | 29 +
  8 files changed, 40 insertions(+), 28 deletions(-)
  create mode 100644 test/CodeGen/R600/rotr.ll
 
 diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
 b/lib/Target/R600/AMDGPUISelLowering.cpp
 index a266df5..b3c51e3 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.cpp
 +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
 @@ -46,6 +46,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine 
 TM) :
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  
 +  // The hardware supports ROTR, but not ROTL
 +  setOperationAction(ISD::ROTL, MVT::i32, Expand);
 +
// Lower floating point store/load to integer store/load to reduce the 
  number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
 diff --git a/lib/Target/R600/AMDGPUISelLowering.h 
 b/lib/Target/R600/AMDGPUISelLowering.h
 index c2a79ea..6f8ab8b 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.h
 +++ b/lib/Target/R600/AMDGPUISelLowering.h
 @@ -115,7 +115,6 @@ enum {
RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
 -  BITALIGN,
BUFFER_STORE,
DWORDADDR,
FRACT,
 diff --git a/lib/Target/R600/AMDGPUInstrInfo.td 
 b/lib/Target/R600/AMDGPUInstrInfo.td
 index b66ae87..a0a3410 100644
 --- a/lib/Target/R600/AMDGPUInstrInfo.td
 +++ b/lib/Target/R600/AMDGPUInstrInfo.td
 @@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile1, 3, [
  // AMDGPU DAG Nodes
  //
  
 -// out = ((a  32) | b)  c)
 -//
 -// Can be used to optimize rtol:
 -// rotl(a, b) = bitalign(a, a, 32 - b)
 -def AMDGPUbitalign : SDNodeAMDGPUISD::BITALIGN, AMDGPUDTIntTernaryOp;
 -
  // This argument to this node is a dword address.
  def AMDGPUdwordaddr : SDNodeAMDGPUISD::DWORDADDR, SDTIntUnaryOp;
  
 diff --git a/lib/Target/R600/AMDGPUInstructions.td 
 b/lib/Target/R600/AMDGPUInstructions.td
 index d2620b2..54df7d0 100644
 --- a/lib/Target/R600/AMDGPUInstructions.td
 +++ b/lib/Target/R600/AMDGPUInstructions.td
 @@ -295,6 +295,12 @@ class BFEPattern Instruction BFE : Pat 
(BFE $x, $y, $z)
  ;
  
 +// rotr pattern
 +class ROTRPattern Instruction BIT_ALIGN : Pat 
 +  (rotr i32:$src0, i32:$src1),
 +  (BIT_ALIGN $src0, $src0, $src1)
 +;
 +
  include R600Instructions.td
  
  include SIInstrInfo.td
 diff --git a/lib/Target/R600/AMDILISelLowering.cpp 
 b/lib/Target/R600/AMDILISelLowering.cpp
 index 922cac1..e20dbe0 100644
 --- a/lib/Target/R600/AMDILISelLowering.cpp
 +++ b/lib/Target/R600/AMDILISelLowering.cpp
 @@ -138,8 +138,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
  setOperationAction(ISD::SMUL_LOHI, VT, Expand);
  setOperationAction(ISD::UMUL_LOHI, VT, Expand);
  
 -// GPU doesn't have a rotl, rotr, or byteswap instruction
 -setOperationAction(ISD::ROTR, VT, Expand);
  setOperationAction(ISD::BSWAP, VT, Expand);
  
  // GPU doesn't have any counting operators
 diff --git a/lib/Target/R600/R600ISelLowering.cpp 
 b/lib/Target/R600/R600ISelLowering.cpp
 index 7252235..e58a8dd 100644
 --- a/lib/Target/R600/R600ISelLowering.cpp
 +++ b/lib/Target/R600/R600ISelLowering.cpp
 @@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine TM) :
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  
 -  setOperationAction(ISD::ROTL, MVT::i32, Custom);
 -
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  
 @@ -327,7 +325,6 @@ using namespace llvm::AMDGPUIntrinsic;
  SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) 
  const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 -  case ISD::ROTL: return LowerROTL(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
 @@ -518,18 +515,6 @@ SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, 
 SelectionDAG DAG) const
return DAG.getConstant(Offset * 4 * TFL-getStackWidth(MF), MVT::i32);
  }
  
 -SDValue R600TargetLowering::LowerROTL(SDValue Op, 

[Mesa-dev] R600/SI Patches: A few cleanups for compute

2013-05-15 Thread Tom Stellard
Hi,

The attached patches add some new patterns and instructions for SI and
are a prerequisite for more invasive compute shader changes that I'm
working on.

Please Review.

-Tom
From 5b87402d1290df5ec8bdbe1333cadb5739a8c8bd Mon Sep 17 00:00:00 2001
From: Tom Stellard thomas.stell...@amd.com
Date: Mon, 13 May 2013 21:50:35 -0400
Subject: [PATCH 1/7] R600/SI: Make fitsRegClass() operands const

---
 lib/Target/R600/SIISelLowering.cpp | 2 +-
 lib/Target/R600/SIISelLowering.h   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index 6bd82a5..d7e2981 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -513,7 +513,7 @@ bool SITargetLowering::foldImm(SDValue Operand, int32_t 
Immediate,
 }
 
 /// \brief Does Op fit into register class RegClass ?
-bool SITargetLowering::fitsRegClass(SelectionDAG DAG, SDValue Op,
+bool SITargetLowering::fitsRegClass(SelectionDAG DAG, const SDValue Op,
 unsigned RegClass) const {
 
   MachineRegisterInfo MRI = DAG.getMachineFunction().getRegInfo(); 
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index de637be..e9ea68a 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -30,7 +30,8 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
   bool foldImm(SDValue Operand, int32_t Immediate,
bool ScalarSlotUsed) const;
-  bool fitsRegClass(SelectionDAG DAG, SDValue Op, unsigned RegClass) const;
+  bool fitsRegClass(SelectionDAG DAG, const SDValue Op,
+unsigned RegClass) const;
   void ensureSRegLimit(SelectionDAG DAG, SDValue Operand, 
unsigned RegClass, bool ScalarSlotUsed) const;
 
-- 
1.8.1.5

From a2d4b16a0022110c6198ed330966911b2bad3361 Mon Sep 17 00:00:00 2001
From: Tom Stellard thomas.stell...@amd.com
Date: Thu, 9 May 2013 16:44:22 -0400
Subject: [PATCH 2/7] R600/SI: Use the same names for VOP3 operands and
 encoding fields

This makes it possible to reorder the operands without breaking the
encoding.
---
 lib/Target/R600/SIInstrFormats.td | 62 +++
 lib/Target/R600/SIInstrInfo.td| 12 
 2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/lib/Target/R600/SIInstrFormats.td 
b/lib/Target/R600/SIInstrFormats.td
index f737ddd..51f323d 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -185,25 +185,25 @@ class VOP2 bits6 op, dag outs, dag ins, string asm, 
listdag pattern :
 class VOP3 bits9 op, dag outs, dag ins, string asm, listdag pattern :
 Enc64 outs, ins, asm, pattern {
 
-  bits8 VDST;
-  bits9 SRC0;
-  bits9 SRC1;
-  bits9 SRC2;
-  bits3 ABS; 
-  bits1 CLAMP;
-  bits2 OMOD;
-  bits3 NEG;
-
-  let Inst{7-0} = VDST;
-  let Inst{10-8} = ABS;
-  let Inst{11} = CLAMP;
+  bits8 dst;
+  bits9 src0;
+  bits9 src1;
+  bits9 src2;
+  bits3 abs;
+  bits1 clamp;
+  bits2 omod;
+  bits3 neg;
+
+  let Inst{7-0} = dst;
+  let Inst{10-8} = abs;
+  let Inst{11} = clamp;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = SRC0;
-  let Inst{49-41} = SRC1;
-  let Inst{58-50} = SRC2;
-  let Inst{60-59} = OMOD;
-  let Inst{63-61} = NEG;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{63-61} = neg;
   
   let mayLoad = 0;
   let mayStore = 0;
@@ -213,23 +213,23 @@ class VOP3 bits9 op, dag outs, dag ins, string asm, 
listdag pattern :
 class VOP3b bits9 op, dag outs, dag ins, string asm, listdag pattern :
 Enc64 outs, ins, asm, pattern {
 
-  bits8 VDST;
-  bits9 SRC0;
-  bits9 SRC1;
-  bits9 SRC2;
-  bits7 SDST;
-  bits2 OMOD;
-  bits3 NEG;
+  bits8 dst;
+  bits9 src0;
+  bits9 src1;
+  bits9 src2;
+  bits7 sdst;
+  bits2 omod;
+  bits3 neg;
 
-  let Inst{7-0} = VDST;
-  let Inst{14-8} = SDST;
+  let Inst{7-0} = dst;
+  let Inst{14-8} = sdst;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = SRC0;
-  let Inst{49-41} = SRC1;
-  let Inst{58-50} = SRC2;
-  let Inst{60-59} = OMOD;
-  let Inst{63-61} = NEG;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{63-61} = neg;
 
   let mayLoad = 0;
   let mayStore = 0;
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index c8aecb7..11c8f9d 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -163,8 +163,8 @@ multiclass VOP1_Helper bits8 op, RegisterClass drc, 
RegisterClass src,
  i32imm:$omod, i32imm:$neg),
 opName#_e64 $dst, $src0, $abs, $clamp, $omod, $neg, []
   , VOP opName {
-let SRC1 = SIOperand.ZERO;
-let SRC2 = SIOperand.ZERO;
+let src1 = SIOperand.ZERO;
+let src2 = SIOperand.ZERO;
   }
 }
 
@@ -189,7 +189,7 @@ multiclass VOP2_Helper bits6 op, RegisterClass 

Re: [Mesa-dev] R600: Various improvements

2013-05-14 Thread Tom Stellard


 From 8aa41148651150eb19332436c76fe490d4b54b1e Mon Sep 17 00:00:00 2001
 From: Vincent Lejeune v...@ovi.com
 Date: Sun, 12 May 2013 16:29:50 +0200
 Subject: [PATCH 1/2] R600: Rename 128 bit registers.
 
 Almost all instructions that takes a 128 bits reg as input (fetch, export...)
 have the abilities to swizzle their argument and output. Instead of printing
 default swizzle for each 128 bits reg, rename T*.XYZW to T* and let 
 instructions
 print potentially optimized swizzle themselve.

Typo here: "swizzle themselve" -> "swizzles themselves"

Both patches are:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

 ---
  lib/Target/R600/R600Instructions.td  | 17 -
  lib/Target/R600/R600RegisterInfo.td  |  2 +-
  test/CodeGen/R600/llvm.AMDGPU.tex.ll | 32 
  3 files changed, 25 insertions(+), 26 deletions(-)
 
 diff --git a/lib/Target/R600/R600Instructions.td 
 b/lib/Target/R600/R600Instructions.td
 index 86e4b4a..abaa94b 100644
 --- a/lib/Target/R600/R600Instructions.td
 +++ b/lib/Target/R600/R600Instructions.td
 @@ -1750,8 +1750,7 @@ let usesCustomInserter = 1 in {
  
  class RAT_WRITE_CACHELESS_eg dag ins, bits4 comp_mask, string name,
listdag pattern
 -: EG_CF_RAT 0x57, 0x2, 0, (outs), ins,
 - !strconcat(name,  $rw_gpr, $index_gpr, $eop), pattern {
 +: EG_CF_RAT 0x57, 0x2, 0, (outs), ins, name, pattern {
let RIM = 0;
// XXX: Have a separate instruction for non-indexed writes.
let TYPE= 1;
 @@ -1771,19 +1770,19 @@ class RAT_WRITE_CACHELESS_eg dag ins, bits4 
 comp_mask, string name,
  // 32-bit store
  def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg 
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
 -  0x1, RAT_WRITE_CACHELESS_32_eg,
 +  0x1, RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop,
[(global_store i32:$rw_gpr, i32:$index_gpr)]
  ;
  
  //128-bit store
  def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg 
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
 -  0xf, RAT_WRITE_CACHELESS_128,
 +  0xf, RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop,
[(global_store v4i32:$rw_gpr, i32:$index_gpr)]
  ;
  
  class VTX_READ_eg string name, bits8 buffer_id, dag outs, listdag 
 pattern
 -: InstR600ISA outs, (ins MEMxi:$ptr), name# $dst, $ptr, pattern,
 +: InstR600ISA outs, (ins MEMxi:$ptr), name, pattern,
VTX_WORD1_GPR, VTX_WORD0 {
  
// Static fields
 @@ -1838,7 +1837,7 @@ class VTX_READ_eg string name, bits8 buffer_id, dag 
 outs, listdag pattern
  }
  
  class VTX_READ_8_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_8, buffer_id, (outs R600_TReg32_X:$dst),
 +: VTX_READ_eg VTX_READ_8 $dst, $ptr, buffer_id, (outs 
 R600_TReg32_X:$dst),
 pattern {
  
let MEGA_FETCH_COUNT = 1;
 @@ -1850,7 +1849,7 @@ class VTX_READ_8_eg bits8 buffer_id, listdag 
 pattern
  }
  
  class VTX_READ_16_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_16, buffer_id, (outs R600_TReg32_X:$dst),
 +: VTX_READ_eg VTX_READ_16 $dst, $ptr, buffer_id, (outs 
 R600_TReg32_X:$dst),
  pattern {
let MEGA_FETCH_COUNT = 2;
let DST_SEL_X = 0;
 @@ -1862,7 +1861,7 @@ class VTX_READ_16_eg bits8 buffer_id, listdag 
 pattern
  }
  
  class VTX_READ_32_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_32, buffer_id, (outs R600_TReg32_X:$dst),
 +: VTX_READ_eg VTX_READ_32 $dst, $ptr, buffer_id, (outs 
 R600_TReg32_X:$dst),
 pattern {
  
let MEGA_FETCH_COUNT = 4;
 @@ -1883,7 +1882,7 @@ class VTX_READ_32_eg bits8 buffer_id, listdag 
 pattern
  }
  
  class VTX_READ_128_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_128, buffer_id, (outs R600_Reg128:$dst),
 +: VTX_READ_eg VTX_READ_128 $dst.XYZW, $ptr, buffer_id, (outs 
 R600_Reg128:$dst),
 pattern {
  
let MEGA_FETCH_COUNT = 16;
 diff --git a/lib/Target/R600/R600RegisterInfo.td 
 b/lib/Target/R600/R600RegisterInfo.td
 index bfc546b..df6004b 100644
 --- a/lib/Target/R600/R600RegisterInfo.td
 +++ b/lib/Target/R600/R600RegisterInfo.td
 @@ -35,7 +35,7 @@ foreach Index = 0-127 in {
  Chan;
}
// 128-bit Temporary Registers
 -  def T#Index#_XYZW : R600Reg_128 T#Index#.XYZW,
 +  def T#Index#_XYZW : R600Reg_128 T#Index#,
 [!castRegister(T#Index#_X),
  !castRegister(T#Index#_Y),
  !castRegister(T#Index#_Z),
 diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll 
 b/test/CodeGen/R600/llvm.AMDGPU.tex.ll
 index 74331fa..81fd43d 100644
 --- a/test/CodeGen/R600/llvm.AMDGPU.tex.ll
 +++ b/test/CodeGen/R600/llvm.AMDGPU.tex.ll
 @@ -1,21 +1,21 @@
  ;RUN: llc  %s -march=r600 -mcpu=redwood | FileCheck %s
  
 -;CHECK: TEX_SAMPLET{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 1
 

Re: [Mesa-dev] R600: Various improvements

2013-05-13 Thread Tom Stellard
On Sun, May 12, 2013 at 07:41:21AM -0700, Vincent Lejeune wrote:
 Hi,
 Patches 2 and 3 factor out some code from the backend. Patch 3 should avoid 
 some recomputation too, which shouldn't hurt.
 Patches 4 and 5 rework how textures are handled in our backend. They replace 
 TGSI-like intrinsics (i.e. intrinsics that use the last argument as TextureTarget, 
 which makes no sense
 from the hw pov) with intrinsics closer to the hw. The pass could be done in mesa but I 
 would rather have it in llvm for now to ensure backward compatibility with llvm 3.3.
 

Hi Vincent,

Just some small comments on patches 4 and 5.  With those changes,
this series is:

Reviewed-by: Tom Stellard thomas.stell...@amd.com

 From 3974315f153e67913f8cc4b4d52550bf6ab33e59 Mon Sep 17 00:00:00 2001
 From: Vincent Lejeune v...@ovi.com
 Date: Sun, 12 May 2013 16:29:50 +0200
 Subject: [PATCH 4/5] R600: Rename 128 bit registers.
 
 ---
  lib/Target/R600/R600Instructions.td | 17 -
  lib/Target/R600/R600RegisterInfo.td |  2 +-
  2 files changed, 9 insertions(+), 10 deletions(-)
 

What is the reason for renaming these registers?  Could you add an
explanation to the commit message?

 diff --git a/lib/Target/R600/R600Instructions.td 
 b/lib/Target/R600/R600Instructions.td
 index 86e4b4a..abaa94b 100644
 --- a/lib/Target/R600/R600Instructions.td
 +++ b/lib/Target/R600/R600Instructions.td
 @@ -1750,8 +1750,7 @@ let usesCustomInserter = 1 in {
  
  class RAT_WRITE_CACHELESS_eg dag ins, bits4 comp_mask, string name,
listdag pattern
 -: EG_CF_RAT 0x57, 0x2, 0, (outs), ins,
 - !strconcat(name,  $rw_gpr, $index_gpr, $eop), pattern {
 +: EG_CF_RAT 0x57, 0x2, 0, (outs), ins, name, pattern {
let RIM = 0;
// XXX: Have a separate instruction for non-indexed writes.
let TYPE= 1;
 @@ -1771,19 +1770,19 @@ class RAT_WRITE_CACHELESS_eg dag ins, bits4 
 comp_mask, string name,
  // 32-bit store
  def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg 
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
 -  0x1, RAT_WRITE_CACHELESS_32_eg,
 +  0x1, RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop,
[(global_store i32:$rw_gpr, i32:$index_gpr)]
  ;
  
  //128-bit store
  def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg 
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
 -  0xf, RAT_WRITE_CACHELESS_128,
 +  0xf, RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop,
[(global_store v4i32:$rw_gpr, i32:$index_gpr)]
  ;
  
  class VTX_READ_eg string name, bits8 buffer_id, dag outs, listdag 
 pattern
 -: InstR600ISA outs, (ins MEMxi:$ptr), name# $dst, $ptr, pattern,
 +: InstR600ISA outs, (ins MEMxi:$ptr), name, pattern,
VTX_WORD1_GPR, VTX_WORD0 {
  
// Static fields
 @@ -1838,7 +1837,7 @@ class VTX_READ_eg string name, bits8 buffer_id, dag 
 outs, listdag pattern
  }
  
  class VTX_READ_8_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_8, buffer_id, (outs R600_TReg32_X:$dst),
 +: VTX_READ_eg VTX_READ_8 $dst, $ptr, buffer_id, (outs 
 R600_TReg32_X:$dst),
 pattern {
  
let MEGA_FETCH_COUNT = 1;
 @@ -1850,7 +1849,7 @@ class VTX_READ_8_eg bits8 buffer_id, listdag 
 pattern
  }
  
  class VTX_READ_16_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_16, buffer_id, (outs R600_TReg32_X:$dst),
 +: VTX_READ_eg VTX_READ_16 $dst, $ptr, buffer_id, (outs 
 R600_TReg32_X:$dst),
  pattern {
let MEGA_FETCH_COUNT = 2;
let DST_SEL_X = 0;
 @@ -1862,7 +1861,7 @@ class VTX_READ_16_eg bits8 buffer_id, listdag 
 pattern
  }
  
  class VTX_READ_32_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_32, buffer_id, (outs R600_TReg32_X:$dst),
 +: VTX_READ_eg VTX_READ_32 $dst, $ptr, buffer_id, (outs 
 R600_TReg32_X:$dst),
 pattern {
  
let MEGA_FETCH_COUNT = 4;
 @@ -1883,7 +1882,7 @@ class VTX_READ_32_eg bits8 buffer_id, listdag 
 pattern
  }
  
  class VTX_READ_128_eg bits8 buffer_id, listdag pattern
 -: VTX_READ_eg VTX_READ_128, buffer_id, (outs R600_Reg128:$dst),
 +: VTX_READ_eg VTX_READ_128 $dst.XYZW, $ptr, buffer_id, (outs 
 R600_Reg128:$dst),
 pattern {
  
let MEGA_FETCH_COUNT = 16;
 diff --git a/lib/Target/R600/R600RegisterInfo.td 
 b/lib/Target/R600/R600RegisterInfo.td
 index bfc546b..df6004b 100644
 --- a/lib/Target/R600/R600RegisterInfo.td
 +++ b/lib/Target/R600/R600RegisterInfo.td
 @@ -35,7 +35,7 @@ foreach Index = 0-127 in {
  Chan;
}
// 128-bit Temporary Registers
 -  def T#Index#_XYZW : R600Reg_128 T#Index#.XYZW,
 +  def T#Index#_XYZW : R600Reg_128 T#Index#,
 [!castRegister(T#Index#_X),
  !castRegister(T#Index#_Y),
  !castRegister(T#Index#_Z),
 -- 
 1.8.2.1
 

 From 6840d3e3995283e98cd535db36ba24364f690072 Mon 

[Mesa-dev] R600: Expand vselect and SRA for v2i32 and v4i32 (v2)

2013-05-08 Thread Aaron Watry
These two patches fix a number of piglit OpenCL test failures on my
HD6850 (Barts).

There are no piglit CL test regressions and the llvm make check runs
without any unexpected failures.

v2: Add tests for v4i32 data type.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patchset: Emit true ISA

2013-05-06 Thread Tom Stellard
On Sat, May 04, 2013 at 09:09:25AM -0700, Vincent Lejeune wrote:
 Hi,
 
 Thank for doing this.
 Patches 1 2 and 3 have my rb.
 For patch 4:


Hi Vincent,

Attached is an updated version of patch 4.

-Tom
 
 @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const 
 MCInstrInfo MCII,
  
  void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS,
 SmallVectorImplMCFixup Fixups) 
  const {
 -  if (isFCOp(MI.getOpcode())){
 -EmitFCInstr(MI, OS);
 -  } else if (MI.getOpcode() == AMDGPU::RETURN ||
 +  if (MI.getOpcode() == AMDGPU::RETURN ||
  MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
  MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
  MI.getOpcode() == AMDGPU::BUNDLE ||
 @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
 MI, raw_ostream OS,
  return;
} else {
  switch(MI.getOpcode()) {
 -case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 -case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 -  uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
 -  Emit(inst, OS);
 -  break;
 -}
  case AMDGPU::CONSTANT_LOAD_eg:
  case AMDGPU::VTX_READ_PARAM_8_eg:
  case AMDGPU::VTX_READ_PARAM_16_eg:
 Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST 
 instead and to remove the switch() statement ?
 @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
 MI, raw_ostream OS,
Emit((u_int32_t) 0, OS);
break;
  } 
 - Mail original -
  De : Tom Stellard t...@stellard.net
  À : llvm-comm...@cs.uiuc.edu
  Cc : mesa-dev@lists.freedesktop.org
  Envoyé le : Samedi 4 mai 2013 0h53
  Objet : R600 Patchset: Emit true ISA
  
  Hi,
  
  The attached patches modify the CodeEmitter to emit true ISA.
  Previously, we were prefixing all instructions with an instruction type
  byte.
  
  Vincent did most of the work to convert the CodeEmitter to true ISA,
  these patches are just the last few cleanups that are needed to finish
  the project.
  
  Please test/review.
  
  Thanks,
  Tom
  
  ___
  llvm-commits mailing list
  llvm-comm...@cs.uiuc.edu
  http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
 
From 385d53cb2324e59ae91f1b632c789183e658c335 Mon Sep 17 00:00:00 2001
From: Tom Stellard thomas.stell...@amd.com
Date: Fri, 3 May 2013 15:27:23 -0700
Subject: [PATCH] R600: Remove dead code from the CodeEmitter v2

v2:
  - Replace switch statement with TSFlags query
---
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 456 +++--
 lib/Target/R600/R600Defines.h  |   3 +
 lib/Target/R600/R600InstrInfo.cpp  |   5 +-
 3 files changed, 64 insertions(+), 400 deletions(-)

diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 10dee20..271a974 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -26,9 +26,6 @@
 #include llvm/Support/raw_ostream.h
 #include stdio.h
 
-#define SRC_BYTE_COUNT 11
-#define DST_BYTE_COUNT 5
-
 using namespace llvm;
 
 namespace {
@@ -56,30 +53,14 @@ public:
  SmallVectorImplMCFixup Fixups) const;
 private:
 
-  void EmitALUInstr(const MCInst MI, SmallVectorImplMCFixup Fixups,
-raw_ostream OS) const;
-  void EmitSrc(const MCInst MI, unsigned OpIdx, raw_ostream OS) const;
-  void EmitSrcISA(const MCInst MI, unsigned RegOpIdx, unsigned SelOpIdx,
-raw_ostream OS) const;
-  void EmitDst(const MCInst MI, raw_ostream OS) const;
-  void EmitFCInstr(const MCInst MI, raw_ostream OS) const;
-
-  void EmitNullBytes(unsigned int byteCount, raw_ostream OS) const;
-
   void EmitByte(unsigned int byte, raw_ostream OS) const;
 
-  void EmitTwoBytes(uint32_t bytes, raw_ostream OS) const;
-
   void Emit(uint32_t value, raw_ostream OS) const;
   void Emit(uint64_t value, raw_ostream OS) const;
 
   unsigned getHWRegChan(unsigned reg) const;
   unsigned getHWReg(unsigned regNo) const;
 
-  bool isFCOp(unsigned opcode) const;
-  bool isTexOp(unsigned opcode) const;
-  bool isFlagSet(const MCInst MI, unsigned Operand, unsigned Flag) const;
-
 };
 
 } // End anonymous namespace
@@ -125,344 +106,82 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const 
MCInstrInfo MCII,
 
 void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS,
SmallVectorImplMCFixup Fixups) const 
{
-  if (isFCOp(MI.getOpcode())){
-EmitFCInstr(MI, OS);
-  } else if (MI.getOpcode() == AMDGPU::RETURN ||
+  const MCInstrDesc Desc = MCII.get(MI.getOpcode());
+  if (MI.getOpcode() == AMDGPU::RETURN ||
 MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
 MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
 MI.getOpcode() == AMDGPU::BUNDLE ||
 MI.getOpcode() == AMDGPU::KILL) {
 return;
-  } else {
-

Re: [Mesa-dev] R600 Patchset: Emit true ISA

2013-05-06 Thread Vincent Lejeune
Reviewed-by: Vincent Lejeune <vljn at ovi.com>


- Mail original -
 De : Tom Stellard t...@stellard.net
 À : Vincent Lejeune v...@ovi.com
 Cc : llvm-comm...@cs.uiuc.edu llvm-comm...@cs.uiuc.edu; 
 mesa-dev@lists.freedesktop.org mesa-dev@lists.freedesktop.org
 Envoyé le : Lundi 6 mai 2013 17h02
 Objet : Re: R600 Patchset: Emit true ISA
 
 On Sat, May 04, 2013 at 09:09:25AM -0700, Vincent Lejeune wrote:
  Hi,
 
  Thank for doing this.
  Patches 1 2 and 3 have my rb.
  For patch 4:
 
 
 Hi Vincent,
 
 Attached is an updated version of patch 4.
 
 -Tom
 
  @@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const 
 MCInstrInfo MCII,
   
   void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
 raw_ostream OS,
                                          SmallVectorImplMCFixup 
 Fixups) const {
  -  if (isFCOp(MI.getOpcode())){
  -    EmitFCInstr(MI, OS);
  -  } else if (MI.getOpcode() == AMDGPU::RETURN ||
  +  if (MI.getOpcode() == AMDGPU::RETURN ||
       MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
       MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
       MI.getOpcode() == AMDGPU::BUNDLE ||
  @@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const 
 MCInst MI, raw_ostream OS,
       return;
     } else {
       switch(MI.getOpcode()) {
  -    case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  -    case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
  -      uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
  -      Emit(inst, OS);
  -      break;
  -    }
       case AMDGPU::CONSTANT_LOAD_eg:
       case AMDGPU::VTX_READ_PARAM_8_eg:
       case AMDGPU::VTX_READ_PARAM_16_eg:
  Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST 
 instead and to remove the switch() statement ?
  @@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const 
 MCInst MI, raw_ostream OS,
         Emit((u_int32_t) 0, OS);
         break;
       } 
  - Mail original -
   De : Tom Stellard t...@stellard.net
   À : llvm-comm...@cs.uiuc.edu
   Cc : mesa-dev@lists.freedesktop.org
   Envoyé le : Samedi 4 mai 2013 0h53
   Objet : R600 Patchset: Emit true ISA
   
   Hi,
   
   The attached patches modify the CodeEmitter to emit true ISA.
   Previously, we were prefixing all instructions with an instruction 
 type
   byte.
   
   Vincent did most of the work to convert the CodeEmitter to true ISA,
   these patches are just the last few cleanups that are needed to finish
   the project.
   
   Please test/review.
   
   Thanks,
   Tom
   
   ___
   llvm-commits mailing list
   llvm-comm...@cs.uiuc.edu
   http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
  
 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600: Expand vselect and SRA for v2i32 and v4i32

2013-05-06 Thread Aaron Watry
These two patches fix a number of piglit OpenCL test failures on my
HD6850 (Barts).

There are no piglit CL test regressions and the llvm make check runs
without any unexpected failures.

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600: Expand vselect and SRA for v2i32 and v4i32

2013-05-06 Thread Tom Stellard
On Mon, May 06, 2013 at 07:35:42PM -0500, Aaron Watry wrote:
 These two patches fix a number of piglit OpenCL test failures on my
 HD6850 (Barts).
 
 There are no piglit CL test regressions and the llvm make check runs
 without any unexpected failures.


Hi Aaron,

These patches look good to me, but you should also add some lit tests
for these opcodes.  For R600, the tests are located in
test/CodeGen/R600.  A good example to look at is the srl.ll test, the
sra.ll test should be more or less the same.  fcmp-cnd.ll would
be a good example for the vselect.ll test as a vector version of it
should generate vselect nodes (though you'll want to make sure to use
i32 types instead of floats).

-Tom
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patchset: Emit true ISA

2013-05-04 Thread Vincent Lejeune
Hi,

Thanks for doing this.
Patches 1 2 and 3 have my rb.
For patch 4:

@@ -125,9 +106,7 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const 
MCInstrInfo MCII,
 
 void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, raw_ostream OS,
SmallVectorImplMCFixup Fixups) 
 const {
-  if (isFCOp(MI.getOpcode())){
-EmitFCInstr(MI, OS);
-  } else if (MI.getOpcode() == AMDGPU::RETURN ||
+  if (MI.getOpcode() == AMDGPU::RETURN ||
 MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
 MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
 MI.getOpcode() == AMDGPU::BUNDLE ||
@@ -135,12 +114,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
MI, raw_ostream OS,
 return;
   } else {
 switch(MI.getOpcode()) {
-case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
-case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
-  uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
-  Emit(inst, OS);
-  break;
-}
 case AMDGPU::CONSTANT_LOAD_eg:
 case AMDGPU::VTX_READ_PARAM_8_eg:
 case AMDGPU::VTX_READ_PARAM_16_eg:
Is it possible to use R600_InstFlag::VTX_INST and R600_InstFlag::TEX_INST 
instead and to remove the switch() statement ?
@@ -234,44 +207,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 
MI, raw_ostream OS,
   Emit((u_int32_t) 0, OS);
   break;
 } 
- Mail original -
 De : Tom Stellard t...@stellard.net
 À : llvm-comm...@cs.uiuc.edu
 Cc : mesa-dev@lists.freedesktop.org
 Envoyé le : Samedi 4 mai 2013 0h53
 Objet : R600 Patchset: Emit true ISA
 
 Hi,
 
 The attached patches modify the CodeEmitter to emit true ISA.
 Previously, we were prefixing all instructions with an instruction type
 byte.
 
 Vincent did most of the work to convert the CodeEmitter to true ISA,
 these patches are just the last few cleanups that are needed to finish
 the project.
 
 Please test/review.
 
 Thanks,
 Tom
 
 ___
 llvm-commits mailing list
 llvm-comm...@cs.uiuc.edu
 http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] R600 Patchset: Emit true ISA

2013-05-04 Thread Aaron Watry
This series, and the associated mesa changes are all:
Tested-By: Aaron Watry awa...@gmail.com

--Aaron

On Fri, May 3, 2013 at 5:53 PM, Tom Stellard t...@stellard.net wrote:
 Hi,

 The attached patches modify the CodeEmitter to emit true ISA.
 Previously, we were prefixing all instructions with an instruction type
 byte.

 Vincent did most of the work to convert the CodeEmitter to true ISA,
 these patches are just the last few cleanups that are needed to finish
 the project.

 Please test/review.

 Thanks,
 Tom

 ___
 mesa-dev mailing list
 mesa-dev@lists.freedesktop.org
 http://lists.freedesktop.org/mailman/listinfo/mesa-dev

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 sb test results

2013-05-03 Thread Lauri Kasanen
On Fri, 03 May 2013 01:27:27 +0400
Vadim Girlin vadimgir...@gmail.com wrote:

 I'm almost sure that the same issue that you have with glxgears affects 
 your app too, so you might want to wait until we resolve the problem 
 with gears, possibly this will solve other rendering issues as well.

...

 By the way, I won't be very surprised if some old gcc release simply 
 fails at handling bitfields which are used to store both the keys of 
 shader variants in r600g and bytecode data in r600-sb (the same data 
 that ends up being broken in your glxgears dump), IIRC there were 
 bitfields-related bugs.

It's not a bug, but undefined behavior AFAIK. I sent a patch that fixes
the constant rebuilds (r600g: Correctly initialize the shader key).

With these currently pending patches applied, I get no more visual
distortion, and the fps improves a bit (28 -> 32).

Just in case it'd be useful to you, here's the current sb,vs,ps output:
http://bayfiles.net/file/PnH3/9BRcGY/foo_shaders.gz

- Lauri
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600 Patchset: Emit true ISA

2013-05-03 Thread Tom Stellard
Hi,

The attached patches modify the CodeEmitter to emit true ISA.
Previously, we were prefixing all instructions with an instruction type
byte.

Vincent did most of the work to convert the CodeEmitter to true ISA,
these patches are just the last few cleanups that are needed to finish
the project.

Please test/review.

Thanks,
Tom
From 4fc6af0637de0eae0542a987e93d467bad3a4eee Mon Sep 17 00:00:00 2001
From: Tom Stellard thomas.stell...@amd.com
Date: Fri, 3 May 2013 11:17:18 -0700
Subject: [PATCH 1/4] R600: Emit ISA for CALL_FS_* instructions

---
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |  1 -
 test/CodeGen/R600/call_fs.ll   | 15 +++
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/R600/call_fs.ll

diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 7c83d86..8261477 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -262,7 +262,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
raw_ostream OS,
 }
 case AMDGPU::CF_CALL_FS_EG:
 case AMDGPU::CF_CALL_FS_R600:
-  return;
 case AMDGPU::CF_TC_EG:
 case AMDGPU::CF_VC_EG:
 case AMDGPU::CF_TC_R600:
diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll
new file mode 100644
index 000..fd21b72
--- /dev/null
+++ b/test/CodeGen/R600/call_fs.ll
@@ -0,0 +1,15 @@
+
+; RUN: llc  %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck 
--check-prefix=EG-CHECK %s
+; RUN: llc  %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck 
--check-prefix=R600-CHECK %s
+
+; EG-CHECK: @call_fs
+; EG-CHECK: CALL_FS  ; encoding: [0x03,0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
+; R600-CHECK: @call_fs
+; R600-CHECK:CALL_FS ; encoding: [0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
+
+
+define void @call_fs() #0 {
+  ret void
+}
+
+attributes #0 = { ShaderType=1 } ; Vertex Shader
-- 
1.7.11.4

From 24b983199b5c81eb0362f4c583eafb842255acc5 Mon Sep 17 00:00:00 2001
From: Tom Stellard thomas.stell...@amd.com
Date: Fri, 3 May 2013 11:38:33 -0700
Subject: [PATCH 2/4] R600: Stop emitting the instruction type byte before
 each instruction

---
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 35 ++
 test/CodeGen/R600/call_fs.ll   |  4 +--
 test/CodeGen/R600/cf_end.ll|  4 +--
 3 files changed, 6 insertions(+), 37 deletions(-)

diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 8261477..10dee20 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -9,12 +9,8 @@
 //
 /// \file
 ///
-/// This code emitter outputs bytecode that is understood by the r600g driver
-/// in the Mesa [1] project.  The bytecode is very similar to the hardware's 
ISA,
-/// but it still needs to be run through a finalizer in order to be executed
-/// by the GPU.
-///
-/// [1] http://www.mesa3d.org/
+/// \brief The R600 code emitter produces machine code that can be executed
+/// directly on the GPU device.
 //
 
//===--===//
 
@@ -95,16 +91,6 @@ enum RegElement {
   ELEMENT_W
 };
 
-enum InstrTypes {
-  INSTR_ALU = 0,
-  INSTR_TEX,
-  INSTR_FC,
-  INSTR_NATIVE,
-  INSTR_VTX,
-  INSTR_EXPORT,
-  INSTR_CFALU
-};
-
 enum FCInstr {
   FC_IF_PREDICATE = 0,
   FC_ELSE,
@@ -152,7 +138,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
raw_ostream OS,
 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
   uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
-  EmitByte(INSTR_NATIVE, OS);
   Emit(inst, OS);
   break;
 }
@@ -170,9 +155,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
raw_ostream OS,
   uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
   InstWord2 |= 1  19;
 
-  EmitByte(INSTR_NATIVE, OS);
   Emit(InstWord01, OS);
-  EmitByte(INSTR_NATIVE, OS);
   Emit(InstWord2, OS);
   Emit((u_int32_t) 0, OS);
   break;
@@ -246,9 +229,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
raw_ostream OS,
   SrcSelect[ELEMENT_W]  29 | Offsets[0]  0 | Offsets[1]  5 |
   Offsets[2]  10;
 
-  EmitByte(INSTR_NATIVE, OS);
   Emit(Word01, OS);
-  EmitByte(INSTR_NATIVE, OS);
   Emit(Word2, OS);
   Emit((u_int32_t) 0, OS);
   break;
@@ -256,7 +237,6 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
raw_ostream OS,
 case AMDGPU::CF_ALU:
 case AMDGPU::CF_ALU_PUSH_BEFORE: {
   uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
-  EmitByte(INSTR_NATIVE, OS);
   Emit(Inst, OS);
   break;
 }
@@ -289,13 +269,11 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst 

Re: [Mesa-dev] r600 sb test results

2013-05-02 Thread Lauri Kasanen
On Thu, 02 May 2013 00:45:13 +0400
Vadim Girlin vadimgir...@gmail.com wrote:

 On 05/01/2013 11:36 PM, Lauri Kasanen wrote:
  Now that it built, I could test your optimizations in my own apps.
  These are on current master 8eef6ad, on a RV710 (HD 4350 pci-e).
 
  In one of my private apps, using R600_DEBUG=sb caused regressions: FPS
  went from 28 to 7, the SSAO shader gave visual distortions/flicker, and
  the cpu was constantly pegged.
 
  Here's the output from R600_DEBUG=sb,sbstat in case it helps:
  http://bayfiles.net/file/Pmkh/PUj0Ru/vadim.gz
 
  It seems as if it's constantly handling new shaders? My app certainly
  issues no new shaders, they are all linked when the app starts.

 r600g may rebuild shaders at runtime because some GL features are 
 implemented in shader code, so if your app changes some specific GL 
 states (e.g. two-sided rendering mode), then r600g has to build and 
 switch between different shader variants.

It mainly uses the stencil buffer, the clear color is changed in
various passes, some occlusion queries with color masks, but nothing
exotic. New uniforms are of course sent each frame.

 On the other hand there is caching of shader variants in r600g 
 implemented specially to prevent repetitive rebuilding of shaders, looks 
 like it doesn't work in your case for some reason. Optimizations take 
 more time than rebuilding with default backend, that explains 
 performance regression.
 
 Could you provide some test app that reproduces these issues?

It's quite time-consuming to cut it down, and apitraces of it in full are
several gigs (far too much to upload with my connection). I'll see if I
can get just the SSAO isolated, with minimal textures, to get a smaller
trace.

 Please also send me the dump with R600_DEBUG=sb,ps,vs, maybe I'll be 
 able to spot anything wrong there.

http://bayfiles.net/file/PmY5/xgIdlZ/foo.gz

  Let me know what you need to debug this.
 
  - Lauri
 
  PS: I'm not sure if this should be public or not, I think you're the
  only one working on it?
 
 Yes, I doubt that anyone else will work on it, on the other hand I think 
 reporting this on the list might help other users who will possibly hit 
 similar issues. Also at least in this case it looks rather like a 
 problem in r600g, so I'm cc'ing mesa-dev, r600-sb just made this issue 
 more noticeable because shader rebuilding with optimization requires 
 more time.

Using standard r600g, the cpu usage is less than 25% of one core, so
nothing was showing it was constantly rebuilding shaders. Is there some
way I could've found it was doing that, and if so, why?

- Lauri
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 sb test results

2013-05-02 Thread Vadim Girlin

On 05/02/2013 06:34 PM, Lauri Kasanen wrote:

On Thu, 02 May 2013 00:45:13 +0400
Vadim Girlin vadimgir...@gmail.com wrote:


On 05/01/2013 11:36 PM, Lauri Kasanen wrote:

Now that it built, I could test your optimizations in my own apps.
These are on current master 8eef6ad, on a RV710 (HD 4350 pci-e).

In one of my private apps, using R600_DEBUG=sb caused regressions: FPS
went from 28 to 7, the SSAO shader gave visual distortions/flicker, and
the cpu was constantly pegged.

Here's the output from R600_DEBUG=sb,sbstat in case it helps:
http://bayfiles.net/file/Pmkh/PUj0Ru/vadim.gz

It seems as if it's constantly handling new shaders? My app certainly
issues no new shaders, they are all linked when the app starts.


r600g may rebuild shaders at runtime because some GL features are
implemented in shader code, so if your app changes some specific GL
states (e.g. two-sided rendering mode), then r600g has to build and
switch between different shader variants.


It mainly uses the stencil buffer, the clear color is changed in
various passes, some occlusion queries with color masks, but nothing
exotic. New uniforms are of course sent each frame.


On the other hand there is caching of shader variants in r600g
implemented specially to prevent repetitive rebuilding of shaders, looks
like it doesn't work in your case for some reason. Optimizations take
more time than rebuilding with default backend, that explains
performance regression.

Could you provide some test app that reproduces these issues?


It's quite time-consuming to cut it down, and apitraces of it in full are
several gigs (far too much to upload with my connection). I'll see if I
can get just the SSAO isolated, with minimal textures, to get a smaller
trace.


I'm almost sure that the same issue that you have with glxgears affects 
your app too, so you might want to wait until we resolve the problem 
with gears, possibly this will solve other rendering issues as well.





Please also send me the dump with R600_DEBUG=sb,ps,vs, maybe I'll be
able to spot anything wrong there.


http://bayfiles.net/file/PmY5/xgIdlZ/foo.gz


Let me know what you need to debug this.

- Lauri

PS: I'm not sure if this should be public or not, I think you're the
only one working on it?


Yes, I doubt that anyone else will work on it, on the other hand I think
reporting this on the list might help other users who will possibly hit
similar issues. Also at least in this case it looks rather like a
problem in r600g, so I'm cc'ing mesa-dev, r600-sb just made this issue
more noticeable because shader rebuilding with optimization requires
more time.


Using standard r600g, the cpu usage is less than 25% of one core, so
nothing was showing it was constantly rebuilding shaders. Is there some
way I could've found it was doing that, and if so, why?


You could run the app with R600_DEBUG=ps,vs (without sb) - it will 
also print the dump of every built shader. r600-sb doesn't affect the 
logic of shader rebuilding, it just processes the shaders when asked by 
r600g, so I think you'll see the same - a lot of built shaders. You 
could even try this with older mesa (before r600-sb was merged) to be sure.


As for the cause of rebuilding, I don't see any changes in the shaders 
in your dump that might be explained by state changes, it's exactly the 
same shaders rebuilt more than once, so far I don't know why. You might 
want to look into r600_shader_select function with debugger to see 
what's going wrong, it computes the key for required shader variant 
using r600_shader_selector_key, then looks at the list of variants to 
find already built shader with the same key, and builds a new one only 
if it can't find existing shader. Looks like something fails there.


By the way, I won't be very surprised if some old gcc release simply 
fails at handling bitfields which are used to store both the keys of 
shader variants in r600g and bytecode data in r600-sb (the same data 
that ends up being broken in your glxgears dump), IIRC there were 
bitfields-related bugs.


Vadim



- Lauri



___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600/sb binary constants

2013-05-01 Thread Lauri Kasanen
Hi list

The recently added r600 sb backend fails to build on GCC < 4.3, since
it uses binary constants (0b0101).

Is the GCC version dependency intentional, or should the constants be
changed to int/hex?

- Lauri
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] r600-sb: glxgears wrong rendering

2013-05-01 Thread Lauri Kasanen
Hi

Running R600_DEBUG=sb glxgears on a RV710 gives wrong output:
http://i40.tinypic.com/t7gx09.png

This is on current master, git-8eef6ad.

Let me know what you need to debug this.

- Lauri
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600-sb: glxgears wrong rendering

2013-05-01 Thread Vadim Girlin

On 05/01/2013 11:42 PM, Lauri Kasanen wrote:

Hi

Running R600_DEBUG=sb glxgears on a RV710 gives wrong output:
http://i40.tinypic.com/t7gx09.png

This is on current master, git-8eef6ad.

Let me know what you need to debug this.


Please send me the output with R600_DEBUG=sb,ps,vs

Vadim
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] r600 sb test results

2013-05-01 Thread Vadim Girlin

On 05/01/2013 11:36 PM, Lauri Kasanen wrote:

Hi Vadim

Now that it built, I could test your optimizations in my own apps.
These are on current master 8eef6ad, on a RV710 (HD 4350 pci-e).

In one of my private apps, using R600_DEBUG=sb caused regressions: FPS
went from 28 to 7, the SSAO shader gave visual distortions/flicker, and
the cpu was constantly pegged.

Here's the output from R600_DEBUG=sb,sbstat in case it helps:
http://bayfiles.net/file/Pmkh/PUj0Ru/vadim.gz

It seems as if it's constantly handling new shaders? My app certainly
issues no new shaders, they are all linked when the app starts.


Hi,

r600g may rebuild shaders at runtime because some GL features are 
implemented in shader code, so if your app changes some specific GL 
states (e.g. two-sided rendering mode), then r600g has to build and 
switch between different shader variants.


On the other hand there is caching of shader variants in r600g 
implemented specially to prevent repetitive rebuilding of shaders, looks 
like it doesn't work in your case for some reason. Optimizations take 
more time than rebuilding with default backend, that explains 
performance regression.


Could you provide some test app that reproduces these issues?

Please also send me the dump with R600_DEBUG=sb,ps,vs, maybe I'll be 
able to spot anything wrong there.




Let me know what you need to debug this.

- Lauri

PS: I'm not sure if this should be public or not, I think you're the
only one working on it?


Yes, I doubt that anyone else will work on it, on the other hand I think 
reporting this on the list might help other users who will possibly hit 
similar issues. Also at least in this case it looks rather like a 
problem in r600g, so I'm cc'ing mesa-dev, r600-sb just made this issue 
more noticeable because shader rebuilding with optimization requires 
more time.


Vadim
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] R600 Patchset: Optimizations for bfgminer

2013-04-29 Thread Tom Stellard
Hi,

The attached patchset implements a few optimizations for the bfgminer
bitcoin mining program.

Please Review.

-Tom
From 661e832408a8bafc03a7c4c600c4a140b03054b4 Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov dcherkas...@gmail.com
Date: Thu, 7 Mar 2013 20:17:59 +0400
Subject: [PATCH 1/3] R600: Add 64-bit load/store support

* Added R600_Reg64 class
* Added T#Index#.XY registers definition
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 elements extraction from v2f32 and v2i32
* Added v2i32 - v2f32 conversions

Signed-off-by: Dmitry Cherkassov dcherkas...@gmail.com

Tom Stellard:
  - Mark vec2 operations as expand.  The addition of a vec2 register
class made them all legal.
---
 lib/Target/R600/AMDGPUISelLowering.cpp |  6 +++
 lib/Target/R600/AMDILISelDAGToDAG.cpp  | 10 -
 lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |  3 ++
 lib/Target/R600/R600ISelLowering.cpp   | 17 +
 lib/Target/R600/R600InstrInfo.cpp  | 19 ++
 lib/Target/R600/R600Instructions.td| 44 ++
 lib/Target/R600/R600RegisterInfo.td| 16 
 test/CodeGen/R600/64bit-kernel-args.ll | 41 
 test/CodeGen/R600/fadd.ll  | 10 +
 test/CodeGen/R600/fdiv.ll  | 37 +-
 test/CodeGen/R600/fmul.ll  | 10 +
 test/CodeGen/R600/fp_to_sint.ll| 10 +
 test/CodeGen/R600/fp_to_uint.ll| 10 +
 test/CodeGen/R600/fsub.ll  | 20 +++---
 test/CodeGen/R600/setcc.ll | 18 +++--
 test/CodeGen/R600/sint_to_fp.ll| 10 +
 test/CodeGen/R600/udiv.ll  | 20 +++---
 test/CodeGen/R600/uint_to_fp.ll| 10 +
 test/CodeGen/R600/urem.ll  | 21 ---
 19 files changed, 292 insertions(+), 40 deletions(-)
 create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..4a064b1 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) 
:
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
@@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) 
:
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp 
b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index ba75a44..198cd7e 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -167,12 +167,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 if (ST.device()-getGeneration()  AMDGPUDeviceInfo::HD6XXX) {
   break;
 }
+unsigned RegSequenceClassID;
+EVT VT = N->getValueType(0);
+assert(VT.isVector());
+switch (VT.getVectorNumElements()) {
+case 4: RegSequenceClassID = AMDGPU::R600_Reg128RegClassID; break;
+case 2: RegSequenceClassID = AMDGPU::R600_Reg64RegClassID; break;
+default: llvm_unreachable("Unhandled vector width in BUILD_VECTOR");
+}
 // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
 // that adds a 128 bits reg copy when going through TwoAddressInstructions
 // pass. We want to avoid 128 bits copies as much as possible because they
 // can't be bundled by our scheduler.
 SDValue RegSeqArgs[9] = {
-  CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+  CurDAG->getTargetConstant(RegSequenceClassID, MVT::i32),
   SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
   SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
   SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp 
b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 7c83d86..030fc87 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -150,6 +150,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst MI, 
raw_ostream OS,
   } else {
 switch(MI.getOpcode()) {
 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+

  1   2   >