On Wed, 2 Dec 2015, Jakub Jelinek wrote:
> Can you post sample code with assembly for -msoft-stack and -muniform-simt
> showing how are short interesting cases expanded?

Here are some short examples; please let me know if I have misunderstood
and you wanted something else.

First, the effect of -muniform-simt on this input:

int f (int *p, int v)
{
  return __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST);
}

leads to this assembly (shown as a diff: '-' lines are without the option, '+' lines are with it):

 .visible .func (.param.u32 %out_retval)f(.param.u64 %in_ar1, .param.u32 %in_ar2)
 {
        .reg.u64 %ar1;
        .reg.u32 %ar2;
        .reg.u32 %retval;
        .reg.u64 %hr10;
        .reg.u32 %r23;
        .reg.u64 %r25;
        .reg.u32 %r26;
+       .reg.u32 %r28;
+       .reg.pred %r29;
        ld.param.u64 %ar1, [%in_ar1];
        ld.param.u32 %ar2, [%in_ar2];
+       {
+               .reg.u32 %ustmp0;
+               .reg.u64 %ustmp1;
+               .reg.u64 %ustmp2;
+               mov.u32 %ustmp0, %tid.y;
+               mul.wide.u32 %ustmp1, %ustmp0, 4;
+               mov.u64 %ustmp2, __nvptx_uni;
+               add.u64 %ustmp2, %ustmp2, %ustmp1;
+               ld.shared.u32 %r28, [%ustmp2];
+               mov.u32 %ustmp0, %tid.x;
+               and.b32 %r28, %r28, %ustmp0;
+               setp.eq.u32 %r29, %r28, %ustmp0;
+       }
                mov.u64 %r25, %ar1;
                mov.u32 %r26, %ar2;
-               atom.exch.b32   %r23, [%r25], %r26;
+       @%r29   atom.exch.b32   %r23, [%r25], %r26;
+               shfl.idx.b32    %r23, %r23, %r28, 31;
                mov.u32 %retval, %r23;
        st.param.u32    [%out_retval], %retval;
        ret;
        }
+// BEGIN GLOBAL VAR DECL: __nvptx_uni
+.extern .shared .u32 __nvptx_uni[32];

Second, the effect of -msoft-stack on this input:

void g(void *);
void f()
{
  char a[42] __attribute__((aligned(64)));
  g(a);
}

leads to:

 .visible .func f
 {
        .reg.u64 %hr10;
        .reg.u64 %r22;
        .reg.u64 %frame;
-       .local.align 64 .b8 %farray[48];
-       cvta.local.u64 %frame, %farray;
+       .reg.u32 %fstmp0;
+       .reg.u64 %fstmp1;
+       .reg.u64 %fstmp2;
+       mov.u32 %fstmp0, %tid.y;
+       mul.wide.u32 %fstmp1, %fstmp0, 8;
+       mov.u64 %fstmp2, __nvptx_stacks;
+       add.u64 %fstmp2, %fstmp2, %fstmp1;
+       ld.shared.u64 %fstmp1, [%fstmp2];
+       sub.u64 %frame, %fstmp1, 48;
+       and.b64 %frame, %frame, -64;
+       st.shared.u64 [%fstmp2], %frame;
                mov.u64 %r22, %frame;
        {
                .param.u64 %out_arg0;
                st.param.u64 [%out_arg0], %r22;
                call g, (%out_arg0);
        }
+       st.shared.u64 [%fstmp2], %fstmp1;
        ret;
        }
 // BEGIN GLOBAL FUNCTION DECL: g
 .extern .func g(.param.u64 %in_ar1);
+// BEGIN GLOBAL VAR DECL: __nvptx_stacks
+.extern .shared .u64 __nvptx_stacks[32];


Alexander

Reply via email to