On Wed, 2 Dec 2015, Jakub Jelinek wrote:
> Can you post sample code with assembly for -msoft-stack and -muniform-simt
> showing how are short interesting cases expanded?
Here are some short examples; please let me know if I have misunderstood and
you wanted something else.
First, the effect of -muniform-simt on this input:
int f (int *p, int v)
{
return __atomic_exchange_n (p, v, __ATOMIC_SEQ_CST);
}
leads to this assembly (shown as a diff: '-' lines are generated without the option, '+' lines with it):
.visible .func (.param.u32 %out_retval)f(.param.u64 %in_ar1, .param.u32
%in_ar2)
{
.reg.u64 %ar1;
.reg.u32 %ar2;
.reg.u32 %retval;
.reg.u64 %hr10;
.reg.u32 %r23;
.reg.u64 %r25;
.reg.u32 %r26;
+ .reg.u32 %r28;
+ .reg.pred %r29;
ld.param.u64 %ar1, [%in_ar1];
ld.param.u32 %ar2, [%in_ar2];
+ {
+ .reg.u32 %ustmp0;
+ .reg.u64 %ustmp1;
+ .reg.u64 %ustmp2;
+ mov.u32 %ustmp0, %tid.y;
+ mul.wide.u32 %ustmp1, %ustmp0, 4;
+ mov.u64 %ustmp2, __nvptx_uni;
+ add.u64 %ustmp2, %ustmp2, %ustmp1;
+ ld.shared.u32 %r28, [%ustmp2];
+ mov.u32 %ustmp0, %tid.x;
+ and.b32 %r28, %r28, %ustmp0;
+ setp.eq.u32 %r29, %r28, %ustmp0;
+ }
mov.u64 %r25, %ar1;
mov.u32 %r26, %ar2;
- atom.exch.b32 %r23, [%r25], %r26;
+ @%r29 atom.exch.b32 %r23, [%r25], %r26;
+ shfl.idx.b32 %r23, %r23, %r28, 31;
mov.u32 %retval, %r23;
st.param.u32 [%out_retval], %retval;
ret;
}
+// BEGIN GLOBAL VAR DECL: __nvptx_uni
+.extern .shared .u32 __nvptx_uni[32];
And here is the effect of -msoft-stack on this input:
void g(void *);
void f()
{
char a[42] __attribute__((aligned(64)));
g(a);
}
leads to:
.visible .func f
{
.reg.u64 %hr10;
.reg.u64 %r22;
.reg.u64 %frame;
- .local.align 64 .b8 %farray[48];
- cvta.local.u64 %frame, %farray;
+ .reg.u32 %fstmp0;
+ .reg.u64 %fstmp1;
+ .reg.u64 %fstmp2;
+ mov.u32 %fstmp0, %tid.y;
+ mul.wide.u32 %fstmp1, %fstmp0, 8;
+ mov.u64 %fstmp2, __nvptx_stacks;
+ add.u64 %fstmp2, %fstmp2, %fstmp1;
+ ld.shared.u64 %fstmp1, [%fstmp2];
+ sub.u64 %frame, %fstmp1, 48;
+ and.b64 %frame, %frame, -64;
+ st.shared.u64 [%fstmp2], %frame;
mov.u64 %r22, %frame;
{
.param.u64 %out_arg0;
st.param.u64 [%out_arg0], %r22;
call g, (%out_arg0);
}
+ st.shared.u64 [%fstmp2], %fstmp1;
ret;
}
// BEGIN GLOBAL FUNCTION DECL: g
.extern .func g(.param.u64 %in_ar1);
+// BEGIN GLOBAL VAR DECL: __nvptx_stacks
+.extern .shared .u64 __nvptx_stacks[32];
Alexander