On Wed, 2 Dec 2015, Jakub Jelinek wrote:
> Can you post sample code with assembly for -msoft-stack and -muniform-simt
> showing how are short interesting cases expanded?

Here are some short examples; please let me know if I'm misunderstanding and you wanted something else.

First, the effect of -muniform-simt on this input:

    int f (int *p, int v)
    {
      return __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST);
    }

leads to this assembly (showing diff -without/+with option):

 .visible .func (.param.u32 %out_retval)f(.param.u64 %in_ar1, .param.u32 %in_ar2)
 {
         .reg.u64 %ar1;
         .reg.u32 %ar2;
         .reg.u32 %retval;
         .reg.u64 %hr10;
         .reg.u32 %r23;
         .reg.u64 %r25;
         .reg.u32 %r26;
+        .reg.u32 %r28;
+        .reg.pred %r29;
         ld.param.u64 %ar1, [%in_ar1];
         ld.param.u32 %ar2, [%in_ar2];
+        {
+                .reg.u32 %ustmp0;
+                .reg.u64 %ustmp1;
+                .reg.u64 %ustmp2;
+                mov.u32 %ustmp0, %tid.y;
+                mul.wide.u32 %ustmp1, %ustmp0, 4;
+                mov.u64 %ustmp2, __nvptx_uni;
+                add.u64 %ustmp2, %ustmp2, %ustmp1;
+                ld.shared.u32 %r28, [%ustmp2];
+                mov.u32 %ustmp0, %tid.x;
+                and.b32 %r28, %r28, %ustmp0;
+                setp.eq.u32 %r29, %r28, %ustmp0;
+        }
         mov.u64 %r25, %ar1;
         mov.u32 %r26, %ar2;
-        atom.exch.b32 %r23, [%r25], %r26;
+        @%r29 atom.exch.b32 %r23, [%r25], %r26;
+        shfl.idx.b32 %r23, %r23, %r28, 31;
         mov.u32 %retval, %r23;
         st.param.u32 [%out_retval], %retval;
         ret;
 }
+// BEGIN GLOBAL VAR DECL: __nvptx_uni
+.extern .shared .u32 __nvptx_uni[32];

And -msoft-stack for this input:

    void g(void *);
    void f()
    {
      char a[42] __attribute__((aligned(64)));
      g(a);
    }

leads to:

 .visible .func f
 {
         .reg.u64 %hr10;
         .reg.u64 %r22;
         .reg.u64 %frame;
-        .local.align 64 .b8 %farray[48];
-        cvta.local.u64 %frame, %farray;
+        .reg.u32 %fstmp0;
+        .reg.u64 %fstmp1;
+        .reg.u64 %fstmp2;
+        mov.u32 %fstmp0, %tid.y;
+        mul.wide.u32 %fstmp1, %fstmp0, 8;
+        mov.u64 %fstmp2, __nvptx_stacks;
+        add.u64 %fstmp2, %fstmp2, %fstmp1;
+        ld.shared.u64 %fstmp1, [%fstmp2];
+        sub.u64 %frame, %fstmp1, 48;
+        and.b64 %frame, %frame, -64;
+        st.shared.u64 [%fstmp2], %frame;
         mov.u64 %r22, %frame;
         {
                 .param.u64 %out_arg0;
                 st.param.u64 [%out_arg0], %r22;
                 call g, (%out_arg0);
         }
+        st.shared.u64 [%fstmp2], %fstmp1;
         ret;
 }
 // BEGIN GLOBAL FUNCTION DECL: g
 .extern .func g(.param.u64 %in_ar1);
+// BEGIN GLOBAL VAR DECL: __nvptx_stacks
+.extern .shared .u64 __nvptx_stacks[32];

Alexander