Hello, I'm proposing the following patch as a step towards resolving the issue that stack storage (.local memory) in PTX is inaccessible to threads other than the one using that stack. The idea is to have preallocated stacks, and to have the __nvptx_stacks[] array in shared memory hold the current stack pointers. Each thread maintains __nvptx_stacks[tid.y] as its stack pointer; thus for OpenMP the intent is to preallocate on a per-warp basis (not per-thread). For OpenMP SIMD regions we'll have to ensure that conflicting accesses are not introduced.
I've exposed a new command-line option -msoft-stack to ease testing, but for OpenMP we'll have to automatically flip it based on function attributes. Right now that's not easy because OpenMP and OpenACC both use "omp declare target". Jakub, I seem to recall a discussion about OpenACC changing to use a separate attribute, but I cannot find it now. Any advice here? This approach also makes it possible to implement alloca. However, to drop the alloca-avoiding changes in libgomp we'd have to selectively enable -msoft-stack there, only for functions that OpenACC wouldn't use. I've run it through make -k check-c regtesting. These are the new failures, all mysterious: +FAIL: gcc.c-torture/execute/20090113-2.c -O[123s] execution test Execution failure with invalid memory access. +FAIL: gcc.c-torture/execute/20090113-3.c -O[123s] execution test Times out (looping infinitely). The above two I had difficulties investigating due to cuda-gdb 7.0 not showing disassembly for the misbehaving function. +FAIL: gcc.c-torture/execute/loop-15.c -O2 execution test Rather surprising and unclear failure due to branch stack overflow. There are also tests that now pass: +PASS: gcc.c-torture/execute/20020529-1.c -O0 execution test Used to fail with invalid memory access. +PASS: gcc.dg/sibcall-9.c execution test (not meaningful on NVPTX) +PASS: gcc.dg/torture/pr54261-1.c -O[0123s] execution test Atomic modification to stack variables now works. gcc/ * config/nvptx/nvptx.c (need_softstack_decl): Declare. (nvptx_declare_function_name): Handle TARGET_SOFT_STACK. (nvptx_output_return): Restore stack pointer if needed. (nvptx_file_end): Emit declaration of __nvptx_stacks. * config/nvptx/nvptx.opt (msoft-stack): New option. * doc/invoke.texi (-msoft-stack): Document. libgcc/ * config/nvptx/crt0.s (__nvptx_stacks): Define. (%__softstack): Define 128 KiB stack for -msoft-stack. (__main): Setup __nvptx_stacks. 
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 0204ad3..df915b9 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -114,6 +114,9 @@ static unsigned worker_red_align; #define worker_red_name "__worker_red" static GTY(()) rtx worker_red_sym; +/* True if any function references __nvptx_stacks. */ +static bool need_softstack_decl; + /* Allocate a new, cleared machine_function structure. */ static struct machine_function * @@ -689,15 +692,46 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) /* Declare a local variable for the frame. */ sz = get_frame_size (); - if (sz > 0 || cfun->machine->has_call_with_sc) + if (sz == 0 && cfun->machine->has_call_with_sc) + sz = 1; + if (sz > 0) { int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; - fprintf (file, "\t.reg.u%d %%frame;\n" - "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n", - BITS_PER_WORD, alignment, sz == 0 ? 1 : sz); - fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n", - BITS_PER_WORD); + fprintf (file, "\t.reg.u%d %%frame;\n", BITS_PER_WORD); + if (TARGET_SOFT_STACK) + { + /* Maintain 64-bit stack alignment. */ + int keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT; + sz = (sz + keep_align - 1) & ~(keep_align - 1); + int bits = BITS_PER_WORD; + fprintf (file, "\t.reg.u32 %%fstmp0;\n"); + fprintf (file, "\t.reg.u%d %%fstmp1;\n", bits); + fprintf (file, "\t.reg.u%d %%fstmp2;\n", bits); + fprintf (file, "\tmov.u32 %%fstmp0, %%tid.y;\n"); + fprintf (file, "\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", + bits == 64 ? 
".wide" : "", bits); + fprintf (file, "\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); + /* fstmp2 = &__nvptx_stacks[tid.y]; */ + fprintf (file, "\tadd.u%d %%fstmp2, %%fstmp2, %%fstmp1;\n", bits); + fprintf (file, "\tld.shared.u%d %%fstmp1, [%%fstmp2];\n", bits); + fprintf (file, "\tsub.u%d %%frame, %%fstmp1, " + HOST_WIDE_INT_PRINT_DEC ";\n", bits, sz); + if (alignment > keep_align) + fprintf (file, "\tand.b%d %%frame, %%frame, %d;\n", + bits, -alignment); + if (!crtl->is_leaf) + fprintf (file, "\tst.shared.u%d [%%fstmp2], %%frame;\n", bits); + need_softstack_decl = true; + } + else + { + fprintf (file, "\t.local.align %d " + ".b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n", + alignment, sz); + fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n", + BITS_PER_WORD); + } } if (cfun->machine->has_call_with_varargs) @@ -734,6 +768,13 @@ nvptx_output_return (void) { machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode; + if (TARGET_SOFT_STACK + && !crtl->is_leaf + && (get_frame_size () > 0 || cfun->machine->has_call_with_sc)) + { + int bits = BITS_PER_WORD; + fprintf (asm_out_file, "\tst.shared.u%d [%%fstmp2], %%fstmp1;\n", bits); + } if (mode != VOIDmode) { mode = arg_promotion (mode); @@ -3278,6 +3319,11 @@ nvptx_file_end (void) worker_red_align, worker_red_name, worker_red_size); } + + if (need_softstack_decl) + { + fprintf (asm_out_file, ".extern .shared .u64 __nvptx_stacks[];\n;"); + } } /* Expander for the shuffle builtins. */ diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 8017046..7ab09b9 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -28,3 +28,7 @@ Generate code for a 64-bit ABI. mmainkernel Target Report RejectNegative Link in code for a __main kernel. + +msoft-stack +Target Report Mask(SOFT_STACK) +Use custom stacks instead of local memory for automatic storage. 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 587e30e..6e45fb6 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -18935,6 +18935,13 @@ Generate code for 32-bit or 64-bit ABI. Link in code for a __main kernel. This is for stand-alone instead of offloading execution. +@item -msoft-stack +@opindex msoft-stack +Do not use @code{.local} memory for automatic storage. Instead, use pointer +in shared memory array @code{char *__nvptx_stacks[]} at position @code{tid.y} +as the stack pointer. This is for placing automatic variables into storage +that can be accessed from other threads, or modified with atomic instructions. + @end table @node PDP-11 Options diff --git a/libgcc/config/nvptx/crt0.s b/libgcc/config/nvptx/crt0.s index 38327ed..7a42e87 100644 --- a/libgcc/config/nvptx/crt0.s +++ b/libgcc/config/nvptx/crt0.s @@ -22,6 +22,9 @@ exit; } +.visible .shared .u64 __nvptx_stacks[1]; +.global .u64 %__softstack[16384]; + .extern .func (.param.u32 retval) main (.param.u32 argc, .param.u64 argv); .visible .entry __main (.param .u64 __retval, .param.u32 __argc, .param.u64 __argv) @@ -34,6 +37,12 @@ ld.param.u64 %rd0, [__retval]; st.global.u64 [%__exitval], %rd0; + .reg .u64 %stackptr; + mov.u64 %stackptr, %__softstack; + cvta.global.u64 %stackptr, %stackptr; + add.u64 %stackptr, %stackptr, 131072; + st.shared.u64 [__nvptx_stacks], %stackptr; + ld.param.u32 %r1, [__argc]; ld.param.u64 %rd1, [__argv]; st.param.u32 [%argc], %r1; -- 1.8.3.1